From c21104465ed9a6953c23f20ae737c3d1c68c187d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 1 Dec 2022 22:27:18 +0200 Subject: [PATCH 001/167] Fix copying relation in walloged create database in PG15 (#2986) refer #2904 --- pgxn/neon/pagestore_smgr.c | 43 +++++++++++++------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index a8dde3927a..434a1c2b85 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1102,7 +1102,7 @@ PageIsEmptyHeapPage(char *buffer) } static void -neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) { XLogRecPtr lsn = PageGetLSN(buffer); @@ -1116,7 +1116,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch * correctness, the non-logged updates are not critical. But we want to * have a reasonably up-to-date VM and FSM in the page server. */ - if (forknum == FSM_FORKNUM && !RecoveryInProgress()) + if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress()) { /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; @@ -1125,30 +1125,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch XLogFlush(recptr); lsn = recptr; ereport(SmgrTrace, - (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, LSN_FORMAT_ARGS(lsn)))); - } - else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) - { - /* - * Always WAL-log vm. We should never miss clearing visibility map - * bits. - * - * TODO Is it too bad for performance? Hopefully we do not evict - * actively used vm too often. - */ - XLogRecPtr recptr; - - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); - XLogFlush(recptr); - lsn = recptr; - - ereport(SmgrTrace, - (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", + (errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -1543,6 +1520,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) { XLogRecPtr lsn; + BlockNumber n_blocks = 0; switch (reln->smgr_relpersistence) { @@ -1582,7 +1560,16 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, errhint("This limit is defined by neon.max_cluster_size GUC"))); } - neon_wallog_page(reln, forkNum, blkno, buffer); + /* + * Usually Postgres doesn't extend relation on more than one page + * (leaving holes). 
But this rule is violated in PG-15 where CreateAndCopyRelationData + * call smgrextend for destination relation n using size of source relation + */ + get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks); + while (n_blocks < blkno) + neon_wallog_page(reln, forkNum, n_blocks++, buffer, true); + + neon_wallog_page(reln, forkNum, blkno, buffer, false); set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); @@ -2010,7 +1997,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - neon_wallog_page(reln, forknum, blocknum, buffer); + neon_wallog_page(reln, forknum, blocknum, buffer, false); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", From d90b52b4054036dacb8ee8d387098bde9652326c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 1 Dec 2022 22:51:48 +0200 Subject: [PATCH 002/167] Update README - Change "WAL service" to "safekeepers" in the architecture section. The safekeepers together form the WAL service, but we don't use that term much in the code. - Replace the short list of pageserver components with a link /docs. We have more details there. - Add "Other resources" to Documention section, with links to some blog posts and a video presentation. - Remove notice at the top about the Zenith -> Neon rename. There are still a few references to Zenith in the codebase, but not so many that we would need to call it out at the top anymore. --- README.md | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index cda36008d8..1b8c28518e 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,6 @@ Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. -The project used to be called "Zenith". Many of the commands and code comments -still refer to "zenith", but we are in the process of renaming things. - ## Quick start [Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor. @@ -12,19 +9,13 @@ Alternatively, compile and run the project [locally](#running-local-installation ## Architecture overview -A Neon installation consists of compute nodes and a Neon storage engine. - -Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. +A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. The Neon storage engine consists of two major components: - Pageserver. Scalable storage backend for the compute nodes. -- WAL service. The service receives WAL from the compute node and ensures that it is stored durably. +- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. -Pageserver consists of: -- Repository - Neon storage implementation. -- WAL receiver - service that receives WAL from WAL service and stores it in the repository. -- Page service - service that communicates with compute nodes and responds with pages from the repository. 
-- WAL redo - service that builds pages from base images and WAL records on Page service request +See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information. ## Running local installation @@ -229,12 +220,20 @@ CARGO_BUILD_FLAGS="--features=testing" make ## Documentation -Now we use README files to cover design ideas and overall architecture for each module and `rustdoc` style documentation comments. See also [/docs/](/docs/) a top-level overview of all available markdown documentation. +[/docs/](/docs/) Contains a top-level overview of all available markdown documentation. - [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout. To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open` +See also README files in some source directories, and `rustdoc` style documentation comments. + +Other resources: + +- [SELECT 'Hello, World'](https://neon.tech/blog/hello-world/): Blog post by Nikita Shamgunov on the high level architecture +- [Architecture decisions in Neon](https://neon.tech/blog/architecture-decisions-in-neon/): Blog post by Heikki Linnakangas +- [Neon: Serverless PostgreSQL!](https://www.youtube.com/watch?v=rES0yzeERns): Presentation on storage system by Heikki Linnakangas in the CMU Database Group seminar series + ### Postgres-specific terms Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used. From 145e7e4b960620216ea0478af38d22c3c21220d6 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 2 Dec 2022 13:35:01 +0100 Subject: [PATCH 003/167] Prefetch cleanup: (#2876) - **Enable `enable_seqscan_prefetch` by default** - Drop use of `seqscan_prefetch_buffers` in favor of `[maintenance,effective]_io_concurrency` This includes adding some fields to the HeapScan execution node, and vacuum state. - Cleanup some conditionals in vacuumlazy.c - Clarify enable_seqscan_prefetch GUC description - Fix issues in heap SeqScan prefetching where synchronize_seqscan machinery wasn't handled properly. 
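For reference, a minimal sketch of the resulting configuration surface, using only GUC names and values that appear in the diffs of this series (the benchmarking workflow below applies the same settings database-wide via `ALTER DATABASE ... SET`, and `test_bulk_update` later sets them per session); the value 32 is copied from the workflow, not a tuning recommendation:

```
-- opt a session in to sequential-scan prefetching
SET enable_seqscan_prefetch = on;
-- prefetch depth is now taken from the standard io_concurrency GUCs
-- instead of the removed seqscan_prefetch_buffers
SET effective_io_concurrency = 32;
SET maintenance_io_concurrency = 32;
```

Note that `neon.readahead_buffer_size` must be at least as large as the largest of these values, per the updated GUC description in `pgxn/neon/libpagestore.c`.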
--- .github/workflows/benchmarking.yml | 3 ++- pgxn/neon/libpagestore.c | 12 ++++++------ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index a5cf4b4694..860e5d72b6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -232,7 +232,8 @@ jobs: if: matrix.platform == 'neon-captest-prefetch' run: | psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET seqscan_prefetch_buffers=10" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET effective_io_concurrency=32" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET maintenance_io_concurrency=32" env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ae8275168d..1aba2e1ede 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -464,12 +464,12 @@ pg_init_libpagestore(void) NULL, NULL, NULL); DefineCustomIntVariable("neon.readahead_buffer_size", "number of prefetches to buffer", - "This buffer is used to store prefetched data; so " - "it is important that this buffer is at least as " - "large as the configured value of all tablespaces' " - "effective_io_concurrency and maintenance_io_concurrency, " - "your sessions' values of these, and the value for " - "seqscan_prefetch_buffers.", + "This buffer is used to hold and manage prefetched " + "data; so it is important that this buffer is at " + "least as large as the configured value of all " + "tablespaces' effective_io_concurrency and " + "maintenance_io_concurrency, and your sessions' " + "values for these settings.", &readahead_buffer_size, 128, 16, 1024, PGC_USERSET, diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index da50d99db5..06edb5af61 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit da50d99db54848f7a3e910f920aaad7dc6915d36 +Subproject commit 06edb5af6180f99ee1bd6903bae2898d2ea128ef diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 780c3f8e35..edf4c161dd 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 780c3f8e3524c2e32a2e28884c7b647fcebf71d7 +Subproject commit edf4c161dd0182d22c28297e841ca253bc1b8ee4 From 788823ebe3cb1cce3a53d85e28a82184b47ae20d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 2 Dec 2022 17:59:26 +0000 Subject: [PATCH 004/167] Fix named_arguments_used_positionally warnings (#2987) ``` warning: named argument `file` is not used by name --> pageserver/src/tenant/timeline.rs:1078:54 | 1078 | trace!("downloading image file: {}", file = path.display()); | -- ^^^^ this named argument is referred to by position in formatting string | | | this formatting argument uses named argument `file` by position | = note: `#[warn(named_arguments_used_positionally)]` on by default help: use the named argument by name to avoid ambiguity | 1078 | trace!("downloading image file: {file}", file = path.display()); | ++++ ``` Co-authored-by: Heikki Linnakangas --- pageserver/src/tenant/timeline.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3dd6b1f9d6..06ddff05ea 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1075,7 +1075,7 @@ impl Timeline { continue; } - 
trace!("downloading image file: {}", file = path.display()); + trace!("downloading image file: {}", path.display()); let sz = remote_client .download_layer_file(&RelativePath::from_filename(path), &layer_metadata) .await @@ -1105,7 +1105,7 @@ impl Timeline { continue; } - trace!("downloading image file: {}", file = path.display()); + trace!("downloading delta file: {}", path.display()); let sz = remote_client .download_layer_file(&RelativePath::from_filename(path), &layer_metadata) .await From ed27c9802249c54ddcf9f5c171b71f7e587e3ba9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 3 Dec 2022 13:11:02 +0000 Subject: [PATCH 005/167] Nightly Benchmarks: use new prefetch settings (#3000) - Replace `seqscan_prefetch_buffers` with `effective_io_concurrency` and `maintenance_io_concurrency` for `clickbench-compare` job (see https://github.com/neondatabase/neon/pull/2876) - Get the database name in a runtime (it can be `main` or `neondb` or something else) --- .github/workflows/benchmarking.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 860e5d72b6..ec2bea9058 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -231,9 +231,11 @@ jobs: - name: Set database options if: matrix.platform == 'neon-captest-prefetch' run: | - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET maintenance_io_concurrency=32" + DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") + + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -375,8 +377,11 @@ jobs: - name: Set database options if: matrix.platform == 'neon-captest-prefetch' run: | - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10" + DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") + + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} From 4f443c339dde1d2cdd87c7836ca1a8a985036257 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 3 Dec 2022 17:30:55 +0200 Subject: [PATCH 006/167] Tone down retry error logs (#2999) Closes https://github.com/neondatabase/neon/issues/2990 --- pageserver/src/storage_sync2.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 0b17c3fc42..94216747ef 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -204,7 +204,7 @@ use std::sync::{Arc, Mutex}; use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::runtime::Runtime; -use tracing::{error, info, warn}; +use 
tracing::{info, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; @@ -888,10 +888,20 @@ impl RemoteTimelineClient { Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); - error!( - "failed to perform remote task {}, will retry (attempt {}): {:?}", - task.op, retries, e - ); + // uploads may fail due to rate limts (IAM, S3) or spurious network and external errors + // such issues are relatively regular, so don't use WARN or ERROR to avoid alerting + // people and tests until the retries are definitely causing delays. + if retries < 3 { + info!( + "failed to perform remote task {}, will retry (attempt {}): {:?}", + task.op, retries, e + ); + } else { + warn!( + "failed to perform remote task {}, will retry (attempt {}): {:?}", + task.op, retries, e + ); + } // sleep until it's time to retry, or we're cancelled tokio::select! { From ab073696d09aed09bf27a516450535511756c8c7 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Dec 2022 10:56:01 +0000 Subject: [PATCH 007/167] test_bulk_update: use new prefetch settings (#3007) Replace `seqscan_prefetch_buffers` with `effective_io_concurrency` & `maintenance_io_concurrency` in one more place (the last one!) --- test_runner/performance/test_bulk_update.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py index bcd26013e5..f8e29cda69 100644 --- a/test_runner/performance/test_bulk_update.py +++ b/test_runner/performance/test_bulk_update.py @@ -42,7 +42,8 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor) cur.execute("drop table t") cur.execute("set enable_seqscan_prefetch=on") - cur.execute("set seqscan_prefetch_buffers=100") + cur.execute("set effective_io_concurrency=32") + cur.execute("set maintenance_io_concurrency=32") cur.execute(f"create table t2(x integer) WITH (fillfactor={fillfactor})") From 79fdd3d51b0b597e6f3956b7dc9e3c30955c3372 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 5 Dec 2022 13:56:04 +0200 Subject: [PATCH 008/167] Fix #2907: Change missing_layers property to optional in the IndexPart struct (#3005) Move missing_layers property to Option> This will allow the safe removal of it once the upgrade of all page servers is done with this new code --- pageserver/src/storage_sync2/index.rs | 45 ++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index b1f43dcb93..2d5f3b1d54 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -104,7 +104,8 @@ pub struct IndexPart { /// FIXME: unused field. This should be removed, but that changes the on-disk format, /// so we need to make sure we're backwards- (and maybe forwards-) compatible - missing_layers: HashSet, + /// First pass is to move it to Optional and the next would be its removal + missing_layers: Option>, /// Per layer file metadata, which can be present for a present or missing layer file. 
/// @@ -145,7 +146,7 @@ impl IndexPart { Self { version: Self::LATEST_VERSION, timeline_layers, - missing_layers: HashSet::new(), + missing_layers: Some(HashSet::new()), layer_metadata, disk_consistent_lsn, metadata_bytes, @@ -199,7 +200,7 @@ mod tests { let expected = IndexPart { version: 0, timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), - missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(), + missing_layers: Some([RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect()), layer_metadata: HashMap::default(), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), @@ -227,7 +228,7 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 1, timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), - missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(), + missing_layers: Some([RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect()), layer_metadata: HashMap::from([ (RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata { file_size: Some(25600000), @@ -245,4 +246,40 @@ mod tests { let part = serde_json::from_str::(example).unwrap(); assert_eq!(part, expected); } + + #[test] + fn v1_indexpart_is_parsed_with_optional_missing_layers() { + let example = r#"{ + "version":1, + "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? + version: 1, + timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), + layer_metadata: HashMap::from([ + (RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata { + file_size: Some(25600000), + }), + (RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: Some(9007199254741001), + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + missing_layers: None::>, + }; + + let part = serde_json::from_str::(example).unwrap(); + assert_eq!(part, expected); + } } From 38af4535537fb97056ea934502c1f0909b678e36 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 5 Dec 2022 22:48:45 +0200 Subject: [PATCH 009/167] Use async RwLock around tenants (#3009) A step towards more async code in our repo, to help avoid most of the odd blocking calls, that might deadlock, as mentioned in https://github.com/neondatabase/neon/issues/2975 --- pageserver/src/bin/pageserver.rs | 18 +++- pageserver/src/http/routes.rs | 157 +++++++++++++++---------------- pageserver/src/page_service.rs | 2 +- pageserver/src/tenant_mgr.rs | 146 +++++++++++----------------- pageserver/src/tenant_tasks.rs | 2 +- 5 files changed, 149 insertions(+), 176 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 32d3fca47c..6c774ae1ae 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -297,10 +297,20 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }) .transpose() .context("Failed to init generic remote storage")?; - { - let _rt_guard = BACKGROUND_RUNTIME.enter(); - tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())? - }; + + let (init_result_sender, init_result_receiver) = + std::sync::mpsc::channel::>(); + let storage_for_spawn = remote_storage.clone(); + let _handler = BACKGROUND_RUNTIME.spawn(async move { + let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await; + init_result_sender.send(result) + }); + match init_result_receiver.recv() { + Ok(init_result) => init_result.context("Failed to init tenant_mgr")?, + Err(_sender_dropped_err) => { + anyhow::bail!("Failed to init tenant_mgr: no init status was returned"); + } + } // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. 
// bind before launching separate thread so the error reported before startup exits diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 32f96b3c5c..db262598d7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -5,7 +5,6 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use pageserver_api::models::TenantState; use remote_storage::GenericRemoteStorage; -use tokio::task::JoinError; use tracing::*; use super::models::{ @@ -189,7 +188,9 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; - let _entered = info_span!("timeline_list", tenant = %tenant_id).entered(); + let response_data = async { + let tenant = tenant_mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timelines = tenant.list_timelines(); - let (tenant_state, timelines) = { - let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; - (tenant.current_state(), tenant.list_timelines()) - }; + let mut response_data = Vec::with_capacity(timelines.len()); + for timeline in timelines { + let timeline_info = build_timeline_info( + tenant.current_state(), + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) + .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") + .map_err(ApiError::InternalServerError)?; - let mut response_data = Vec::with_capacity(timelines.len()); - for timeline in timelines { - let timeline_info = build_timeline_info( - tenant_state, - &timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") - .map_err(ApiError::InternalServerError)?; + response_data.push(timeline_info); + } - response_data.push(timeline_info); + Ok(response_data) } + .instrument(info_span!("timeline_list", tenant = %tenant_id)) + .await?; json_response(StatusCode::OK, response_data) } @@ -281,20 +286,16 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result) -> Result, async fn tenant_list_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_list").entered(); - tenant_mgr::list_tenants() - .iter() - .map(|(id, state)| TenantInfo { - id: *id, - state: *state, - current_physical_size: None, - has_in_progress_downloads: Some(state.has_in_progress_downloads()), - }) - .collect::>() - }) - .await - .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; + let response_data = tenant_mgr::list_tenants() + .instrument(info_span!("tenant_list")) + .await + .iter() + .map(|(id, state)| TenantInfo { + id: *id, + state: *state, + current_physical_size: None, + has_in_progress_downloads: Some(state.has_in_progress_downloads()), + }) + .collect::>(); json_response(StatusCode::OK, response_data) } @@ -417,9 +416,8 @@ async fn tenant_status(request: Request) -> Result, ApiErro let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let tenant_info = tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_status_handler", tenant = %tenant_id).entered(); - let tenant = tenant_mgr::get_tenant(tenant_id, false)?; + let tenant_info = async { + let tenant = 
tenant_mgr::get_tenant(tenant_id, false).await?; // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -428,17 +426,15 @@ async fn tenant_status(request: Request) -> Result, ApiErro } let state = tenant.current_state(); - let tenant_info = TenantInfo { + Ok(TenantInfo { id: tenant_id, state, current_physical_size: Some(current_physical_size), has_in_progress_downloads: Some(state.has_in_progress_downloads()), - }; - - Ok::<_, anyhow::Error>(tenant_info) - }) + }) + } + .instrument(info_span!("tenant_status_handler", tenant = %tenant_id)) .await - .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))? .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, tenant_info) @@ -448,7 +444,9 @@ async fn tenant_size_handler(request: Request) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::InternalServerError)?; + let tenant = tenant_mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::InternalServerError)?; // this can be long operation, it currently is not backed by any request coalescing or similar let inputs = tenant @@ -565,22 +563,19 @@ async fn tenant_create_handler(mut request: Request) -> Result { @@ -671,17 +666,13 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result Result> { - let tenant = tenant_mgr::get_tenant(tenant_id, false)?; + let tenant = tenant_mgr::get_tenant(tenant_id, false).await?; match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { Ok(wait_result) => wait_result // no .context(), the error message is good enough and some tests depend on it diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 70de713a26..bd765dabf8 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,13 +1,15 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use std::collections::hash_map; +use std::collections::{hash_map, HashMap}; use std::ffi::OsStr; -use std::fs; use std::path::Path; use std::sync::Arc; +use tokio::fs; use anyhow::Context; +use once_cell::sync::Lazy; +use tokio::sync::RwLock; use tracing::*; use remote_storage::GenericRemoteStorage; @@ -20,86 +22,49 @@ use crate::tenant_config::TenantConfOpt; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; -mod tenants_state { - use once_cell::sync::Lazy; - use std::{ - collections::HashMap, - sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, - }; - use utils::id::TenantId; - - use crate::tenant::Tenant; - - static TENANTS: Lazy>>> = - Lazy::new(|| RwLock::new(HashMap::new())); - - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { - TENANTS - .read() - .expect("Failed to read() tenants lock, it got poisoned") - } - - pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { - TENANTS - .write() - .expect("Failed to write() tenants lock, it got poisoned") - } -} +static TENANTS: Lazy>>> = + Lazy::new(|| RwLock::new(HashMap::new())); /// Initialize repositories with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the tenant once download is completed. 
-pub fn init_tenant_mgr( +#[instrument(skip(conf, remote_storage))] +pub async fn init_tenant_mgr( conf: &'static PageServerConf, remote_storage: Option, ) -> anyhow::Result<()> { - let _entered = info_span!("init_tenant_mgr").entered(); - // Scan local filesystem for attached tenants let mut number_of_tenants = 0; let tenants_dir = conf.tenants_path(); - for dir_entry in std::fs::read_dir(&tenants_dir) - .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? - { - match &dir_entry { - Ok(dir_entry) => { + + let mut dir_entries = fs::read_dir(&tenants_dir) + .await + .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + + loop { + match dir_entries.next_entry().await { + Ok(None) => break, + Ok(Some(dir_entry)) => { let tenant_dir_path = dir_entry.path(); if crate::is_temporary(&tenant_dir_path) { - info!( - "Found temporary tenant directory, removing: {}", - tenant_dir_path.display() - ); - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { - error!( - "Failed to remove temporary directory '{}': {:?}", - tenant_dir_path.display(), - e - ); + info!("Found temporary tenant directory, removing: {tenant_dir_path:?}",); + if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await { + error!("Failed to remove temporary directory {tenant_dir_path:?}: {e:?}"); } } else { match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) { - Ok(Some(tenant)) => { - tenants_state::write_tenants().insert(tenant.tenant_id(), tenant); - number_of_tenants += 1; - } - Ok(None) => { - // This case happens if we crash during attach before creating the attach marker file - if let Err(e) = std::fs::remove_dir(&tenant_dir_path) { - error!( - "Failed to remove empty tenant directory '{}': {e:#}", - tenant_dir_path.display() - ) + Ok(Some(tenant)) => { + TENANTS.write().await.insert(tenant.tenant_id(), tenant); + number_of_tenants += 1; } + Ok(None) => { + // This case happens if we crash during attach before creating the attach marker file + if let Err(e) = fs::remove_dir(&tenant_dir_path).await { + error!("Failed to remove empty tenant directory {tenant_dir_path:?}: {e:#}") + } + } + Err(e) => error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"), } - Err(e) => { - error!( - "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", - tenants_dir.display(), - dir_entry, - e - ); - } - } } } Err(e) => { @@ -107,10 +72,7 @@ pub fn init_tenant_mgr( // here, the pageserver startup fails altogether, causing outage for *all* // tenants. That seems worse. 
error!( - "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", - dir_entry, - tenants_dir.display(), - e, + "Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}" ); } } @@ -165,7 +127,7 @@ fn load_local_tenant( /// pub async fn shutdown_all_tenants() { let tenants_to_shut_down = { - let mut m = tenants_state::write_tenants(); + let mut m = TENANTS.write().await; let mut tenants_to_shut_down = Vec::with_capacity(m.len()); for (_, tenant) in m.drain() { if tenant.is_active() { @@ -199,13 +161,13 @@ pub async fn shutdown_all_tenants() { } } -pub fn create_tenant( +pub async fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, remote_storage: Option, ) -> anyhow::Result>> { - match tenants_state::write_tenants().entry(tenant_id) { + match TENANTS.write().await.entry(tenant_id) { hash_map::Entry::Occupied(_) => { debug!("tenant {tenant_id} already exists"); Ok(None) @@ -238,21 +200,23 @@ pub fn create_tenant( } } -pub fn update_tenant_config( +pub async fn update_tenant_config( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); + get_tenant(tenant_id, true) + .await? + .update_tenant_config(tenant_conf); Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; Ok(()) } /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. -pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { - let m = tenants_state::read_tenants(); +pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { + let m = TENANTS.read().await; let tenant = m .get(&tenant_id) .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; @@ -288,7 +252,7 @@ pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> an info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; info!("timeline task shutdown completed"); - match get_tenant(tenant_id, true) { + match get_tenant(tenant_id, true).await { Ok(tenant) => { tenant.delete_timeline(timeline_id).await?; } @@ -303,7 +267,7 @@ pub async fn detach_tenant( tenant_id: TenantId, ) -> anyhow::Result<()> { let tenant = match { - let mut tenants_accessor = tenants_state::write_tenants(); + let mut tenants_accessor = TENANTS.write().await; tenants_accessor.remove(&tenant_id) } { Some(tenant) => tenant, @@ -321,12 +285,14 @@ pub async fn detach_tenant( // we will attempt to remove files which no longer exist. 
This can be fixed by having shutdown // mechanism for tenant that will clean temporary data to avoid any references to ephemeral files let local_tenant_directory = conf.tenant_path(&tenant_id); - fs::remove_dir_all(&local_tenant_directory).with_context(|| { - format!( - "Failed to remove local tenant directory '{}'", - local_tenant_directory.display() - ) - })?; + fs::remove_dir_all(&local_tenant_directory) + .await + .with_context(|| { + format!( + "Failed to remove local tenant directory '{}'", + local_tenant_directory.display() + ) + })?; Ok(()) } @@ -334,8 +300,10 @@ pub async fn detach_tenant( /// /// Get list of tenants, for the mgmt API /// -pub fn list_tenants() -> Vec<(TenantId, TenantState)> { - tenants_state::read_tenants() +pub async fn list_tenants() -> Vec<(TenantId, TenantState)> { + TENANTS + .read() + .await .iter() .map(|(id, tenant)| (*id, tenant.current_state())) .collect() @@ -350,7 +318,7 @@ pub async fn attach_tenant( tenant_id: TenantId, remote_storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { - match tenants_state::write_tenants().entry(tenant_id) { + match TENANTS.write().await.entry(tenant_id) { hash_map::Entry::Occupied(e) => { // Cannot attach a tenant that already exists. The error message depends on // the state it's in. @@ -378,12 +346,12 @@ use { }; #[cfg(feature = "testing")] -pub fn immediate_gc( +pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, gc_req: TimelineGcRequest, ) -> Result>, ApiError> { - let guard = tenants_state::read_tenants(); + let guard = TENANTS.read().await; let tenant = guard .get(&tenant_id) diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index d17f0eed43..d3aec933c2 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -155,7 +155,7 @@ async fn wait_for_active_tenant( wait: Duration, ) -> ControlFlow<(), Arc> { let tenant = loop { - match tenant_mgr::get_tenant(tenant_id, false) { + match tenant_mgr::get_tenant(tenant_id, false).await { Ok(tenant) => break tenant, Err(e) => { error!("Failed to get a tenant {tenant_id}: {e:#}"); From 7a9cb75e02624f8bfecb49ba94740d5282f5f01f Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 1 Dec 2022 12:35:43 +0200 Subject: [PATCH 010/167] Replace dynamic dispatch with static dispatch --- libs/remote_storage/src/lib.rs | 18 ++++++++++-------- libs/remote_storage/src/local_fs.rs | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 4bdd2b9608..0218fb464d 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -178,21 +178,23 @@ impl std::error::Error for DownloadError {} /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. 
#[derive(Clone)] -pub struct GenericRemoteStorage(Arc); +pub enum GenericRemoteStorage { + LocalFs(LocalFs), + AwsS3(Arc), +} impl Deref for GenericRemoteStorage { type Target = dyn RemoteStorage; fn deref(&self) -> &Self::Target { - self.0.as_ref() + match self { + GenericRemoteStorage::LocalFs(local_fs) => local_fs, + GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(), + } } } impl GenericRemoteStorage { - pub fn new(storage: impl RemoteStorage) -> Self { - Self(Arc::new(storage)) - } - pub fn from_config( working_directory: PathBuf, storage_config: &RemoteStorageConfig, @@ -200,12 +202,12 @@ impl GenericRemoteStorage { Ok(match &storage_config.storage { RemoteStorageKind::LocalFs(root) => { info!("Using fs root '{}' as a remote storage", root.display()); - GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?) + GenericRemoteStorage::LocalFs(LocalFs::new(root.clone(), working_directory)?) } RemoteStorageKind::AwsS3(s3_config) => { info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?) + GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config, working_directory)?)) } }) } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 2f824cc453..363d47f38d 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -33,6 +33,7 @@ fn remote_object_id_from_path(path: &Path) -> anyhow::Result { )) } +#[derive(Debug, Clone)] pub struct LocalFs { working_directory: PathBuf, storage_root: PathBuf, From b38473d3670eed625ba4e29d82601183a8c0c07d Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 1 Dec 2022 16:44:14 +0200 Subject: [PATCH 011/167] Remove RelativePath conversions Function was unused, but publicly exported from the module lib, so not reported by rustc as unused --- pageserver/src/storage_sync2.rs | 4 +-- pageserver/src/storage_sync2/index.rs | 49 +++++++++++++-------------- pageserver/src/tenant/timeline.rs | 8 ++--- 3 files changed, 30 insertions(+), 31 deletions(-) diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 94216747ef..44b82a6a8c 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -612,7 +612,7 @@ impl RemoteTimelineClient { "file size not initialized in metadata" ); - let relative_path = RelativePath::from_local_path( + let relative_path = RelativePath::strip_base_path( &self.conf.timeline_path(&self.timeline_id, &self.tenant_id), path, )?; @@ -644,7 +644,7 @@ impl RemoteTimelineClient { // Convert the paths into RelativePaths, and gather other information we need. let mut relative_paths = Vec::with_capacity(paths.len()); for path in paths { - relative_paths.push(RelativePath::from_local_path( + relative_paths.push(RelativePath::strip_base_path( &self.conf.timeline_path(&self.timeline_id, &self.tenant_id), path, )?); diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index 2d5f3b1d54..99dd2eae6a 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -18,27 +18,26 @@ use utils::lsn::Lsn; /// A part of the filesystem path, that needs a root to become a path again. 
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] #[serde(transparent)] -pub struct RelativePath(String); +pub struct RelativePath(PathBuf); impl RelativePath { - /// Attempts to strip off the base from path, producing a relative path or an error. - pub fn from_local_path(timeline_path: &Path, path: &Path) -> anyhow::Result { - let relative = path.strip_prefix(timeline_path).with_context(|| { - format!( - "path '{}' is not relative to base '{}'", - path.display(), - timeline_path.display() - ) + pub fn new(relative_path: &Path) -> Self { + debug_assert!( + relative_path.is_relative(), + "Path {relative_path:?} is not relative" + ); + Self(relative_path.to_path_buf()) + } + + pub fn strip_base_path(base_path: &Path, full_path: &Path) -> anyhow::Result { + let relative = full_path.strip_prefix(base_path).with_context(|| { + format!("path {full_path:?} is not relative to base {base_path:?}",) })?; - Ok(Self::from_filename(relative)) + Ok(Self::new(relative)) } - pub fn from_filename(path: &Path) -> RelativePath { - RelativePath(path.to_string_lossy().to_string()) - } - - pub fn to_local_path(&self, timeline_path: &Path) -> PathBuf { - timeline_path.join(&self.0) + pub fn to_local_path(&self, base_path: &Path) -> PathBuf { + base_path.join(&self.0) } } @@ -199,8 +198,8 @@ mod tests { let expected = IndexPart { version: 0, - timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), - missing_layers: Some([RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect()), + timeline_layers: HashSet::from([RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))]), + missing_layers: Some(HashSet::from([RelativePath(PathBuf::from("not_a_real_layer_but_adding_coverage"))])), layer_metadata: HashMap::default(), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), @@ -227,13 +226,13 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 1, - timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), - missing_layers: Some([RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect()), + timeline_layers: HashSet::from([RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))]), + missing_layers: Some(HashSet::from([RelativePath(PathBuf::from("not_a_real_layer_but_adding_coverage"))])), layer_metadata: HashMap::from([ - (RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata { + (RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")), IndexLayerMetadata { file_size: Some(25600000), }), - (RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata { + (RelativePath(PathBuf::from("not_a_real_layer_but_adding_coverage")), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: Some(9007199254741001), @@ -263,12 +262,12 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, - timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), + timeline_layers: [RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))].into_iter().collect(), layer_metadata: HashMap::from([ - (RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata { + (RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")), IndexLayerMetadata { file_size: Some(25600000), }), - (RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata { + (RelativePath(PathBuf::from("not_a_real_layer_but_adding_coverage")), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: Some(9007199254741001), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 06ddff05ea..e42e887524 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1013,7 +1013,7 @@ impl Timeline { local_filenames.retain(|path| { let layer_metadata = index_part .layer_metadata - .get(&RelativePath::from_filename(path)) + .get(&RelativePath::new(path)) .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); @@ -1062,7 +1062,7 @@ impl Timeline { let layer_metadata = index_part .layer_metadata - .get(&RelativePath::from_filename(path)) + .get(&RelativePath::new(path)) .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); @@ -1077,7 +1077,7 @@ impl Timeline { trace!("downloading image file: {}", path.display()); let sz = remote_client - .download_layer_file(&RelativePath::from_filename(path), &layer_metadata) + .download_layer_file(&RelativePath::new(path), &layer_metadata) .await .context("download image layer")?; trace!("done"); @@ -1107,7 +1107,7 @@ impl Timeline { trace!("downloading delta file: {}", path.display()); let sz = remote_client - .download_layer_file(&RelativePath::from_filename(path), &layer_metadata) + .download_layer_file(&RelativePath::new(path), &layer_metadata) .await .context("download delta layer")?; trace!("done"); From c0480facc104cf6fc699d27dc3fd6cde47e1df89 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 5 Dec 2022 11:41:12 +0100 Subject: [PATCH 012/167] Rename RelativePath to RemotePath Improve rustdocs a bit --- pageserver/src/storage_sync2.rs | 12 +++---- pageserver/src/storage_sync2/download.rs | 4 +-- pageserver/src/storage_sync2/index.rs | 46 ++++++++++++------------ pageserver/src/tenant/timeline.rs | 10 +++--- 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 44b82a6a8c..5b3225028f 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -217,7 +217,7 @@ use crate::metrics::RemoteOpKind; use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS; use crate::{ config::PageServerConf, - storage_sync::index::{LayerFileMetadata, RelativePath}, + storage_sync::index::{LayerFileMetadata, RemotePath}, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, @@ -287,7 +287,7 @@ struct UploadQueueInitialized { /// All layer files stored in the remote storage, taking into account all /// in-progress and queued operations - latest_files: HashMap, + latest_files: HashMap, /// Metadata stored in the remote storage, taking into account all /// in-progress and queued operations. @@ -510,7 +510,7 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. pub async fn download_layer_file( &self, - path: &RelativePath, + path: &RemotePath, layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { let downloaded_size = download::download_layer_file( @@ -612,7 +612,7 @@ impl RemoteTimelineClient { "file size not initialized in metadata" ); - let relative_path = RelativePath::strip_base_path( + let relative_path = RemotePath::strip_base_path( &self.conf.timeline_path(&self.timeline_id, &self.tenant_id), path, )?; @@ -644,7 +644,7 @@ impl RemoteTimelineClient { // Convert the paths into RelativePaths, and gather other information we need. 
let mut relative_paths = Vec::with_capacity(paths.len()); for path in paths { - relative_paths.push(RelativePath::strip_base_path( + relative_paths.push(RemotePath::strip_base_path( &self.conf.timeline_path(&self.timeline_id, &self.tenant_id), path, )?); @@ -1093,7 +1093,7 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { + fn assert_file_list(a: &HashSet, b: &[&str]) { let xx = PathBuf::from(""); let mut avec: Vec = a .iter() diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs index 12b858fb57..d68455ea2b 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/storage_sync2/download.rs @@ -15,7 +15,7 @@ use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use super::index::IndexPart; -use super::RelativePath; +use super::RemotePath; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { fs::File::open(path).await?.sync_all().await @@ -31,7 +31,7 @@ pub async fn download_layer_file<'a>( storage: &'a GenericRemoteStorage, tenant_id: TenantId, timeline_id: TimelineId, - path: &'a RelativePath, + path: &'a RemotePath, layer_metadata: &'a LayerFileMetadata, ) -> anyhow::Result { let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index 99dd2eae6a..a1da37b826 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -15,12 +15,14 @@ use crate::tenant::metadata::TimelineMetadata; use utils::lsn::Lsn; -/// A part of the filesystem path, that needs a root to become a path again. +/// Path on the remote storage, relative to some inner prefix. +/// The prefix is an implementation detail, that allows representing local paths +/// as the remote ones, stripping the local storage prefix away. #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] #[serde(transparent)] -pub struct RelativePath(PathBuf); +pub struct RemotePath(PathBuf); -impl RelativePath { +impl RemotePath { pub fn new(relative_path: &Path) -> Self { debug_assert!( relative_path.is_relative(), @@ -96,22 +98,22 @@ pub struct IndexPart { #[serde(default)] version: usize, - /// Each of the layers present on remote storage. + /// Layer names, which are stored on the remote storage. /// /// Additional metadata can might exist in `layer_metadata`. - pub timeline_layers: HashSet, + pub timeline_layers: HashSet, /// FIXME: unused field. This should be removed, but that changes the on-disk format, /// so we need to make sure we're backwards- (and maybe forwards-) compatible /// First pass is to move it to Optional and the next would be its removal - missing_layers: Option>, + missing_layers: Option>, - /// Per layer file metadata, which can be present for a present or missing layer file. + /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. #[serde(default)] - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated here for convenience. 
@@ -129,7 +131,7 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; pub fn new( - layers_and_metadata: HashMap, + layers_and_metadata: HashMap, disk_consistent_lsn: Lsn, metadata_bytes: Vec, ) -> Self { @@ -172,9 +174,9 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { } fn separate_paths_and_metadata( - input: &HashMap, - output: &mut HashSet, - layer_metadata: &mut HashMap, + input: &HashMap, + output: &mut HashSet, + layer_metadata: &mut HashMap, ) { for (path, metadata) in input { let metadata = IndexLayerMetadata::from(metadata); @@ -198,8 +200,8 @@ mod tests { let expected = IndexPart { version: 0, - timeline_layers: HashSet::from([RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))]), - missing_layers: Some(HashSet::from([RelativePath(PathBuf::from("not_a_real_layer_but_adding_coverage"))])), + timeline_layers: HashSet::from([RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))]), + missing_layers: Some(HashSet::from([RemotePath(PathBuf::from("not_a_real_layer_but_adding_coverage"))])), layer_metadata: HashMap::default(), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), @@ -226,13 +228,13 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 1, - timeline_layers: HashSet::from([RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))]), - missing_layers: Some(HashSet::from([RelativePath(PathBuf::from("not_a_real_layer_but_adding_coverage"))])), + timeline_layers: HashSet::from([RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))]), + missing_layers: Some(HashSet::from([RemotePath(PathBuf::from("not_a_real_layer_but_adding_coverage"))])), layer_metadata: HashMap::from([ - (RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")), IndexLayerMetadata { + (RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")), IndexLayerMetadata { file_size: Some(25600000), }), - (RelativePath(PathBuf::from("not_a_real_layer_but_adding_coverage")), IndexLayerMetadata { + (RemotePath(PathBuf::from("not_a_real_layer_but_adding_coverage")), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: Some(9007199254741001), @@ -262,12 +264,12 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, - timeline_layers: [RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))].into_iter().collect(), + timeline_layers: [RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))].into_iter().collect(), layer_metadata: HashMap::from([ - (RelativePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")), IndexLayerMetadata { + (RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")), IndexLayerMetadata { file_size: Some(25600000), }), - (RelativePath(PathBuf::from("not_a_real_layer_but_adding_coverage")), IndexLayerMetadata { + (RemotePath(PathBuf::from("not_a_real_layer_but_adding_coverage")), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: Some(9007199254741001), @@ -275,7 +277,7 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), - missing_layers: None::>, + missing_layers: None::>, }; let part = serde_json::from_str::(example).unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e42e887524..1bf967c4bf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,7 +19,7 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::{Duration, Instant, SystemTime}; -use crate::storage_sync::index::{IndexPart, RelativePath}; +use crate::storage_sync::index::{IndexPart, RemotePath}; use crate::storage_sync::RemoteTimelineClient; use crate::tenant::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, @@ -1013,7 +1013,7 @@ impl Timeline { local_filenames.retain(|path| { let layer_metadata = index_part .layer_metadata - .get(&RelativePath::new(path)) + .get(&RemotePath::new(path)) .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); @@ -1062,7 +1062,7 @@ impl Timeline { let layer_metadata = index_part .layer_metadata - .get(&RelativePath::new(path)) + .get(&RemotePath::new(path)) .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); @@ -1077,7 +1077,7 @@ impl Timeline { trace!("downloading image file: {}", path.display()); let sz = remote_client - .download_layer_file(&RelativePath::new(path), &layer_metadata) + .download_layer_file(&RemotePath::new(path), &layer_metadata) .await .context("download image layer")?; trace!("done"); @@ -1107,7 +1107,7 @@ impl Timeline { trace!("downloading delta file: {}", path.display()); let sz = remote_client - .download_layer_file(&RelativePath::new(path), &layer_metadata) + .download_layer_file(&RemotePath::new(path), &layer_metadata) .await .context("download delta layer")?; trace!("done"); From 61825dfb57037acf29d08f6d6181375beeea1aa2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 5 Dec 2022 08:35:17 +0000 Subject: [PATCH 013/167] Update chrono to 0.4.23; use only clock feature from it --- Cargo.lock | 48 ++++++++++++---------------------------- compute_tools/Cargo.toml | 2 +- pageserver/Cargo.toml | 2 +- 3 files changed, 16 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c9acb882eb..3bcd189d9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -85,7 +85,7 @@ dependencies = [ 
"num-traits", "rusticata-macros", "thiserror", - "time 0.3.15", + "time", ] [[package]] @@ -190,7 +190,7 @@ dependencies = [ "http", "hyper", "ring", - "time 0.3.15", + "time", "tokio", "tower", "tracing", @@ -331,7 +331,7 @@ dependencies = [ "percent-encoding", "regex", "ring", - "time 0.3.15", + "time", "tracing", ] @@ -468,7 +468,7 @@ dependencies = [ "itoa", "num-integer", "ryu", - "time 0.3.15", + "time", ] [[package]] @@ -713,17 +713,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.22" +version = "0.4.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1" +checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" dependencies = [ "iana-time-zone", - "js-sys", "num-integer", "num-traits", "serde", - "time 0.1.44", - "wasm-bindgen", "winapi", ] @@ -1538,7 +1535,7 @@ checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", ] [[package]] @@ -2104,7 +2101,7 @@ checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf" dependencies = [ "libc", "log", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys", ] @@ -2985,7 +2982,7 @@ checksum = "ffbe84efe2f38dea12e9bfc1f65377fdf03e53a18cb3b995faedf7934c7e785b" dependencies = [ "pem", "ring", - "time 0.3.15", + "time", "yasna", ] @@ -3478,7 +3475,7 @@ dependencies = [ "serde", "serde_json", "serde_with_macros", - "time 0.3.15", + "time", ] [[package]] @@ -3569,7 +3566,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.15", + "time", ] [[package]] @@ -3831,17 +3828,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "time" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - [[package]] name = "time" version = "0.3.15" @@ -4439,12 +4425,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -4673,7 +4653,7 @@ dependencies = [ "serde", "stable_deref_trait", "syn", - "time 0.3.15", + "time", "tokio", "tokio-util", "tower", @@ -4696,7 +4676,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time 0.3.15", + "time", ] [[package]] @@ -4720,7 +4700,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "346d34a236c9d3e5f3b9b74563f238f955bbd05fa0b8b4efa53c130c43982f4c" dependencies = [ - "time 0.3.15", + "time", ] [[package]] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index d6f8fae34c..a35cef197d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] anyhow = "1.0" -chrono = "0.4" +chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = "4.0" env_logger = "0.9" futures = "0.3.13" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 61c7b8ae97..43d51f90c1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -18,7 +18,7 @@ async-stream = "0.3" 
async-trait = "0.1" byteorder = "1.4.3" bytes = "1.0.1" -chrono = "0.4.19" +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["string"] } close_fds = "0.3.2" const_format = "0.2.21" From 046ba67d68c52a7f43863f64fc249e5762e55772 Mon Sep 17 00:00:00 2001 From: danieltprice <10074684+danieltprice@users.noreply.github.com> Date: Tue, 6 Dec 2022 11:27:46 -0400 Subject: [PATCH 014/167] Update README.md (#3015) Update readme to remove reference to the invite gate. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1b8c28518e..c31bac6446 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. ## Quick start -[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor. +Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. Alternatively, compile and run the project [locally](#running-local-installation). From d6bfe955c65a0cdf66475a6c52350b365a043c99 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 6 Dec 2022 16:30:02 +0100 Subject: [PATCH 015/167] Add commands to unload and load the tenant in memory (#2977) Closes https://github.com/neondatabase/neon/issues/2537 Follow-up of https://github.com/neondatabase/neon/pull/2950 With the new model that prevents attaching without the remote storage, it has started to be even more odd to add attach-with-files functionality (in addition to the issues raised previously). 
Adds two separate commands: * `POST {tenant_id}/ignore` that places a mark file to skip such tenant on every start and removes it from memory * `POST {tenant_id}/schedule_load` that tries to load a tenant from local FS similar to what pageserver does now on startup, but without directory removals --- pageserver/src/config.rs | 8 +- pageserver/src/http/openapi_spec.yml | 91 +++++- pageserver/src/http/routes.rs | 35 ++- pageserver/src/lib.rs | 7 + pageserver/src/tenant.rs | 4 +- pageserver/src/tenant/timeline.rs | 4 +- pageserver/src/tenant_mgr.rs | 301 ++++++++++++------ test_runner/fixtures/neon_fixtures.py | 8 + test_runner/regress/test_remote_storage.py | 16 +- test_runner/regress/test_tenant_detach.py | 335 +++++++++++++++++++++ 10 files changed, 703 insertions(+), 106 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1ac07f6ebc..b3eab6c3cb 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -27,7 +27,9 @@ use utils::{ use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::{METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX}; +use crate::{ + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX, +}; pub mod defaults { use crate::tenant_config::defaults::*; @@ -402,6 +404,10 @@ impl PageServerConf { .join(TENANT_ATTACHING_MARKER_FILENAME) } + pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf { + self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME) + } + /// Points to a place in pageserver's local directory, /// where certain tenant's tenantconf file should be located. pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b8f467cd02..932cda50b7 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -274,6 +274,7 @@ paths: schema: type: string format: hex + post: description: Schedules attach operation to happen in the background for given tenant responses: @@ -325,7 +326,9 @@ paths: type: string format: hex post: - description: Detach local tenant + description: | + Remove tenant data (including all corresponding timelines) from pageserver's memory and file system. + Files on the remote storage are not affected. responses: "200": description: Tenant detached @@ -354,6 +357,92 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/ignore: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + post: + description: | + Remove tenant data (including all corresponding timelines) from pageserver's memory. + Files on local disk and remote storage are not affected. + + Future pageserver restarts won't load the data back until `load` is called on such tenant. 
+ responses: + "200": + description: Tenant ignored + "400": + description: Error when no tenant id found in path parameters + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/load: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + post: + description: | + Schedules an operation that attempts to load a tenant from the local disk and + synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load. + If the tenant was ignored before, removes the ignore mark and continues with load scheduling. + + Errors if the tenant is absent on disk, already present in memory or fails to schedule its load. + Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness. + responses: + "202": + description: Tenant scheduled to load successfully + "400": + description: Error when no tenant id found in path parameters + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index db262598d7..d1fdf26a5a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -349,13 +349,13 @@ async fn tenant_attach_handler(request: Request) -> Result, if let Some(remote_storage) = &state.remote_storage { // FIXME: distinguish between "Tenant already exists" and other errors - tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage) + tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await .map_err(ApiError::InternalServerError)?; } else { return Err(ApiError::BadRequest(anyhow!( - "attach_tenant is possible because pageserver was configured without remote storage" + "attach_tenant is not possible because pageserver was configured without remote storage" ))); } @@ -394,6 +394,35 @@ async fn tenant_detach_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } +async fn tenant_load_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) + .instrument(info_span!("load", tenant = %tenant_id)) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn tenant_ignore_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + 
check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + let conf = state.conf; + tenant_mgr::ignore_tenant(conf, tenant_id) + .instrument(info_span!("ignore_tenant", tenant = %tenant_id)) + .await + // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors. + // Replace this with better handling once the error type permits it. + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn tenant_list_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -833,6 +862,8 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) + .post("/v1/tenant/:tenant_id/load", tenant_load_handler) + .post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 5147bd26bb..eafcaa88d9 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -125,6 +125,13 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp"; /// Full path: `tenants//timelines/___uninit`. pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +/// A marker file to prevent pageserver from loading a certain tenant on restart. +/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding +/// `ignore` management API command, that expects the ignored tenant to be properly loaded +/// into pageserver's memory before being ignored. +/// Full path: `tenants//___ignored_tenant`. +pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant"; + pub fn is_temporary(path: &Path) -> bool { match path.file_name() { Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 981c049111..87f92402b1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -571,7 +571,7 @@ impl Tenant { pub fn spawn_attach( conf: &'static PageServerConf, tenant_id: TenantId, - remote_storage: &GenericRemoteStorage, + remote_storage: GenericRemoteStorage, ) -> Arc { // XXX: Attach should provide the config, especially during tenant migration. 
// See https://github.com/neondatabase/neon/issues/1555 @@ -584,7 +584,7 @@ impl Tenant { tenant_conf, wal_redo_manager, tenant_id, - Some(remote_storage.clone()), + Some(remote_storage), )); // Do all the hard work in the background diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1bf967c4bf..4011156ec5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1075,7 +1075,7 @@ impl Timeline { continue; } - trace!("downloading image file: {}", path.display()); + trace!("downloading image file: {path:?}"); let sz = remote_client .download_layer_file(&RemotePath::new(path), &layer_metadata) .await @@ -1105,7 +1105,7 @@ impl Timeline { continue; } - trace!("downloading delta file: {}", path.display()); + trace!("downloading delta file: {path:?}"); let sz = remote_client .download_layer_file(&RemotePath::new(path), &layer_metadata) .await diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index bd765dabf8..f4f1eba717 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -13,11 +13,13 @@ use tokio::sync::RwLock; use tracing::*; use remote_storage::GenericRemoteStorage; +use utils::crashsafe; use crate::config::PageServerConf; use crate::task_mgr::{self, TaskKind}; use crate::tenant::{Tenant, TenantState}; use crate::tenant_config::TenantConfOpt; +use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; @@ -47,24 +49,52 @@ pub async fn init_tenant_mgr( Ok(Some(dir_entry)) => { let tenant_dir_path = dir_entry.path(); if crate::is_temporary(&tenant_dir_path) { - info!("Found temporary tenant directory, removing: {tenant_dir_path:?}",); + info!( + "Found temporary tenant directory, removing: {}", + tenant_dir_path.display() + ); if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await { - error!("Failed to remove temporary directory {tenant_dir_path:?}: {e:?}"); + error!( + "Failed to remove temporary directory '{}': {:?}", + tenant_dir_path.display(), + e + ); } } else { - match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) { - Ok(Some(tenant)) => { - TENANTS.write().await.insert(tenant.tenant_id(), tenant); - number_of_tenants += 1; - } - Ok(None) => { - // This case happens if we crash during attach before creating the attach marker file - if let Err(e) = fs::remove_dir(&tenant_dir_path).await { - error!("Failed to remove empty tenant directory {tenant_dir_path:?}: {e:#}") - } - } - Err(e) => error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"), + // This case happens if we crash during attach before creating the attach marker file + let is_empty = tenant_dir_path.is_empty_dir().with_context(|| { + format!("Failed to check whether {tenant_dir_path:?} is an empty dir") + })?; + if is_empty { + info!("removing empty tenant directory {tenant_dir_path:?}"); + if let Err(e) = fs::remove_dir(&tenant_dir_path).await { + error!( + "Failed to remove empty tenant directory '{}': {e:#}", + tenant_dir_path.display() + ) } + continue; + } + + let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); + if tenant_ignore_mark_file.exists() { + info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); + continue; + } + + match schedule_local_tenant_processing( + conf, + &tenant_dir_path, + remote_storage.clone(), + ) { + Ok(tenant) => { + TENANTS.write().await.insert(tenant.tenant_id(), tenant); + number_of_tenants += 1; + } + 
Err(e) => { + error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"); + } + } } } Err(e) => { @@ -82,34 +112,45 @@ pub async fn init_tenant_mgr( Ok(()) } -fn load_local_tenant( +pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, remote_storage: Option, -) -> anyhow::Result>> { - if !tenant_path.is_dir() { - anyhow::bail!("tenant_path is not a directory: {tenant_path:?}") - } - - let is_empty = tenant_path - .is_empty_dir() - .context("check whether tenant_path is an empty dir")?; - if is_empty { - info!("skipping empty tenant directory {tenant_path:?}"); - return Ok(None); - } +) -> anyhow::Result> { + anyhow::ensure!( + tenant_path.is_dir(), + "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory" + ); + anyhow::ensure!( + !crate::is_temporary(tenant_path), + "Cannot load tenant from temporary path {tenant_path:?}" + ); + anyhow::ensure!( + !tenant_path.is_empty_dir().with_context(|| { + format!("Failed to check whether {tenant_path:?} is an empty dir") + })?, + "Cannot load tenant from empty directory {tenant_path:?}" + ); let tenant_id = tenant_path .file_name() .and_then(OsStr::to_str) .unwrap_or_default() .parse::() - .context("Could not parse tenant id out of the tenant dir name")?; + .with_context(|| { + format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}") + })?; + + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); + anyhow::ensure!( + !conf.tenant_ignore_mark_file_path(tenant_id).exists(), + "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" + ); let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, &remote_storage) + Tenant::spawn_attach(conf, tenant_id, remote_storage) } else { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); Tenant::create_broken_tenant(conf, tenant_id) @@ -119,7 +160,7 @@ fn load_local_tenant( // Start loading the tenant into memory. It will initially be in Loading state. Tenant::spawn_load(conf, tenant_id, remote_storage) }; - Ok(Some(tenant)) + Ok(tenant) } /// @@ -177,25 +218,15 @@ pub async fn create_tenant( // If this section ever becomes contentious, introduce a new `TenantState::Creating`. let tenant_directory = super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?; - let created_tenant = load_local_tenant(conf, &tenant_directory, remote_storage)?; - match created_tenant { - None => { - // We get None in case the directory is empty. - // This shouldn't happen here, because we just created the directory. - // So, skip any cleanup work for now, we don't know how we reached this state. 
- anyhow::bail!("we just created the tenant directory, it can't be empty"); - } - Some(tenant) => { - anyhow::ensure!( - tenant_id == tenant.tenant_id(), - "loaded created tenant has unexpected tenant id (expect {} != actual {})", - tenant_id, - tenant.tenant_id() - ); - v.insert(Arc::clone(&tenant)); - Ok(Some(tenant)) - } - } + let created_tenant = + schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?; + let crated_tenant_id = created_tenant.tenant_id(); + anyhow::ensure!( + tenant_id == crated_tenant_id, + "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})", + ); + v.insert(Arc::clone(&created_tenant)); + Ok(Some(created_tenant)) } } } @@ -266,35 +297,58 @@ pub async fn detach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, ) -> anyhow::Result<()> { - let tenant = match { - let mut tenants_accessor = TENANTS.write().await; - tenants_accessor.remove(&tenant_id) - } { - Some(tenant) => tenant, - None => anyhow::bail!("Tenant not found for id {tenant_id}"), - }; + remove_tenant_from_memory(tenant_id, async { + let local_tenant_directory = conf.tenant_path(&tenant_id); + fs::remove_dir_all(&local_tenant_directory) + .await + .with_context(|| { + format!("Failed to remove local tenant directory {local_tenant_directory:?}") + })?; + Ok(()) + }) + .await +} - tenant.set_stopping(); - // shutdown all tenant and timeline tasks: gc, compaction, page service) - task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; +pub async fn load_tenant( + conf: &'static PageServerConf, + tenant_id: TenantId, + remote_storage: Option, +) -> anyhow::Result<()> { + run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + let tenant_path = conf.tenant_path(&tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); + if tenant_ignore_mark.exists() { + std::fs::remove_file(&tenant_ignore_mark) + .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; + } - // If removal fails there will be no way to successfully retry detach, - // because the tenant no longer exists in the in-memory map. And it needs to be removed from it - // before we remove files, because it contains references to tenant - // which references ephemeral files which are deleted on drop. So if we keep these references, - // we will attempt to remove files which no longer exist. 
This can be fixed by having shutdown - // mechanism for tenant that will clean temporary data to avoid any references to ephemeral files - let local_tenant_directory = conf.tenant_path(&tenant_id); - fs::remove_dir_all(&local_tenant_directory) - .await - .with_context(|| { - format!( - "Failed to remove local tenant directory '{}'", - local_tenant_directory.display() - ) - })?; + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage) + .with_context(|| { + format!("Failed to schedule tenant processing in path {tenant_path:?}") + })?; - Ok(()) + vacant_entry.insert(new_tenant); + Ok(()) + }).await +} + +pub async fn ignore_tenant( + conf: &'static PageServerConf, + tenant_id: TenantId, +) -> anyhow::Result<()> { + remove_tenant_from_memory(tenant_id, async { + let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id); + fs::File::create(&ignore_mark_file) + .await + .context("Failed to create ignore mark file") + .and_then(|_| { + crashsafe::fsync_file_and_parent(&ignore_mark_file) + .context("Failed to fsync ignore mark file") + }) + .with_context(|| format!("Failed to crate ignore mark for tenant {tenant_id}"))?; + Ok(()) + }) + .await } /// @@ -316,25 +370,92 @@ pub async fn list_tenants() -> Vec<(TenantId, TenantState)> { pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, - remote_storage: &GenericRemoteStorage, + remote_storage: GenericRemoteStorage, ) -> anyhow::Result<()> { + run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + let tenant_path = conf.tenant_path(&tenant_id); + anyhow::ensure!( + !tenant_path.exists(), + "Cannot attach tenant {tenant_id}, local tenant directory already exists" + ); + + let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage); + vacant_entry.insert(tenant); + + Ok(()) + }) + .await +} + +async fn run_if_no_tenant_in_memory(tenant_id: TenantId, run: F) -> anyhow::Result +where + F: FnOnce(hash_map::VacantEntry>) -> anyhow::Result, +{ match TENANTS.write().await.entry(tenant_id) { hash_map::Entry::Occupied(e) => { - // Cannot attach a tenant that already exists. The error message depends on - // the state it's in. - match e.get().current_state() { - TenantState::Attaching => { - anyhow::bail!("tenant {tenant_id} attach is already in progress") - } - current_state => { - anyhow::bail!("tenant already exists, current state: {current_state:?}") - } - } + anyhow::bail!( + "tenant {tenant_id} already exists, state: {:?}", + e.get().current_state() + ) } - hash_map::Entry::Vacant(v) => { - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage); - v.insert(tenant); - Ok(()) + hash_map::Entry::Vacant(v) => run(v), + } +} + +/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise. +/// Allows to remove other tenant resources manually, via `tenant_cleanup`. +/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal +/// operation would be needed to remove it. +async fn remove_tenant_from_memory( + tenant_id: TenantId, + tenant_cleanup: F, +) -> anyhow::Result +where + F: std::future::Future>, +{ + // It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races. + // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal. + // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to + // avoid holding the lock for the entire process. 
+ { + let tenants_accessor = TENANTS.write().await; + match tenants_accessor.get(&tenant_id) { + Some(tenant) => match tenant.current_state() { + TenantState::Attaching + | TenantState::Loading + | TenantState::Broken + | TenantState::Active => tenant.set_stopping(), + TenantState::Stopping => { + anyhow::bail!("Tenant {tenant_id} is stopping already") + } + }, + None => anyhow::bail!("Tenant not found for id {tenant_id}"), + } + } + + // shutdown all tenant and timeline tasks: gc, compaction, page service) + // No new tasks will be started for this tenant because it's in `Stopping` state. + // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely. + task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; + + match tenant_cleanup + .await + .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}")) + { + Ok(hook_value) => { + let mut tenants_accessor = TENANTS.write().await; + if tenants_accessor.remove(&tenant_id).is_none() { + warn!("Tenant {tenant_id} got removed from memory before operation finished"); + } + Ok(hook_value) + } + Err(e) => { + let tenants_accessor = TENANTS.read().await; + match tenants_accessor.get(&tenant_id) { + Some(tenant) => tenant.set_broken(), + None => warn!("Tenant {tenant_id} got removed from memory"), + } + Err(e) } } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7fc2a7c24b..6fae448794 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1119,6 +1119,14 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") self.verbose_error(res) + def tenant_load(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") + self.verbose_error(res) + + def tenant_ignore(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") + self.verbose_error(res) + def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 550ad43fc9..d1b23123b5 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -71,8 +71,10 @@ def test_remote_storage_backup_and_restore( # FIXME retry downloads without throwing errors env.pageserver.allowed_errors.append(".*failed to load remote timeline.*") # we have a bunch of pytest.raises for these below - env.pageserver.allowed_errors.append(".*tenant already exists.*") - env.pageserver.allowed_errors.append(".*attach is already in progress.*") + env.pageserver.allowed_errors.append(".*tenant .*? 
already exists, state:.*") + env.pageserver.allowed_errors.append( + ".*Cannot attach tenant .*?, local tenant directory already exists.*" + ) pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -136,7 +138,7 @@ def test_remote_storage_backup_and_restore( # assert cannot attach timeline that is scheduled for download # FIXME implement layer download retries - with pytest.raises(Exception, match="tenant already exists, current state: Broken"): + with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): client.tenant_attach(tenant_id) tenant_status = client.tenant_status(tenant_id) @@ -149,9 +151,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.start() # ensure that an initiated attach operation survives pageserver restart - with pytest.raises( - Exception, match=r".*(tenant already exists|attach is already in progress).*" - ): + with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): client.tenant_attach(tenant_id) log.info("waiting for timeline redownload") wait_until( @@ -191,7 +191,7 @@ def test_remote_storage_upload_queue_retries( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", + test_name="test_remote_storage_upload_queue_retries", ) env = neon_env_builder.init_start() @@ -353,7 +353,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", + test_name="test_timeline_deletion_with_files_stuck_in_upload_queue", ) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index bafddc7721..0d3465cc01 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -7,6 +7,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PageserverApiException, PageserverHttpClient, + Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, @@ -167,3 +168,337 @@ def test_detach_while_attaching( with pg.cursor() as cur: cur.execute("SELECT COUNT(*) FROM foo") + + +# Tests that `ignore` and `get` operations' combination is able to remove and restore the tenant in pageserver's memory. 
+# * writes some data into tenant's timeline +# * ensures it's synced with the remote storage +# * `ignore` the tenant +# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared +# * verify the ignored tenant is gone from pageserver's memory +# * restart the pageserver and verify that ignored tenant is still not loaded +# * `load` the same tenant +# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]) +def test_ignored_tenant_reattach( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_remote_storage_backup_and_restore", + ) + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + ignored_tenant_id, _ = env.neon_cli.create_tenant() + tenant_dir = env.repo_dir / "tenants" / str(ignored_tenant_id) + tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_before_ignore.sort() + timelines_before_ignore = [ + timeline["timeline_id"] + for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id) + ] + files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")] + + # ignore the tenant and veirfy it's not present in pageserver replies, with its files still on disk + pageserver_http.tenant_ignore(ignored_tenant_id) + + files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")] + new_files = set(files_after_ignore_with_retain) - set(files_before_ignore) + disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain) + assert ( + len(disappeared_files) == 0 + ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}" + assert ( + len(new_files) == 1 + ), f"Only tenant ignore file should appear on disk but got: {new_files}" + + tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" + assert len(tenants_after_ignore) + 1 == len( + tenants_before_ignore + ), "Only ignored tenant should be missing" + + # restart the pageserver to ensure we don't load the ignore timeline + env.pageserver.stop() + env.pageserver.start() + tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_after_restart.sort() + assert ( + tenants_after_restart == tenants_after_ignore + ), "Ignored tenant should not be reloaded after pageserver restart" + + # now, load it from the local files and expect it works + pageserver_http.tenant_load(tenant_id=ignored_tenant_id) + wait_until_tenant_status(pageserver_http, ignored_tenant_id, "Active", 5) + + tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_after_attach.sort() + assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" + + timelines_after_ignore = [ + timeline["timeline_id"] + for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id) + ] + assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" + + +# Tests that it's possible to `load` tenants with missing layers and get them restored: +# * writes some data into tenant's timeline +# * ensures it's synced with the remote storage +# * `ignore` the tenant +# * removes all 
timeline's local layers +# * `load` the same tenant +# * ensure that it's status is `Active` +# * check that timeline data is restored +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ignored_tenant_download_missing_layers( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ignored_tenant_download_and_attach", + ) + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + data_id = 1 + data_secret = "very secret secret" + insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg) + + tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_before_ignore.sort() + timelines_before_ignore = [ + timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) + ] + + # ignore the tenant and remove its layers + pageserver_http.tenant_ignore(tenant_id) + tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + layers_removed = False + for dir_entry in tenant_timeline_dir.iterdir(): + if dir_entry.name.startswith("00000"): + # Looks like a layer file. Remove it + dir_entry.unlink() + layers_removed = True + assert layers_removed, f"Found no layers for tenant {tenant_timeline_dir}" + + # now, load it from the local files and expect it to work due to remote storage restoration + pageserver_http.tenant_load(tenant_id=tenant_id) + wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5) + + tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_after_attach.sort() + assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" + + timelines_after_ignore = [ + timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) + ] + assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" + + pg.stop() + pg.start() + ensure_test_data(data_id, data_secret, pg) + + +# Tests that it's possible to `load` broken tenants: +# * `ignore` a tenant +# * removes its `metadata` file locally +# * `load` the same tenant +# * ensure that it's status is `Broken` +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ignored_tenant_stays_broken_without_metadata( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ignored_tenant_stays_broken_without_metadata", + ) + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + # ignore the tenant and remove its metadata + pageserver_http.tenant_ignore(tenant_id) + tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + metadata_removed = False + for dir_entry in tenant_timeline_dir.iterdir(): + if dir_entry.name == "metadata": + # Looks like a layer file. 
Remove it + dir_entry.unlink() + metadata_removed = True + assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}" + + env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*") + + # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory + pageserver_http.tenant_load(tenant_id=tenant_id) + wait_until_tenant_status(pageserver_http, tenant_id, "Broken", 5) + + +# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally +# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored. +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_load_attach_negatives( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_load_attach_negatives", + ) + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + + env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") + with pytest.raises( + expected_exception=PageserverApiException, + match=f"tenant {tenant_id} already exists, state: Active", + ): + pageserver_http.tenant_load(tenant_id) + + with pytest.raises( + expected_exception=PageserverApiException, + match=f"tenant {tenant_id} already exists, state: Active", + ): + pageserver_http.tenant_attach(tenant_id) + + pageserver_http.tenant_ignore(tenant_id) + + env.pageserver.allowed_errors.append( + ".*Cannot attach tenant .*?, local tenant directory already exists.*" + ) + with pytest.raises( + expected_exception=PageserverApiException, + match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists", + ): + pageserver_http.tenant_attach(tenant_id) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ignore_while_attaching( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ignore_while_attaching", + ) + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + pageserver_http = env.pageserver.http_client() + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + data_id = 1 + data_secret = "very secret secret" + insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg) + + tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + + # Detach it + pageserver_http.tenant_detach(tenant_id) + # And re-attach, but stop attach task_mgr task from completing + pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")]) + pageserver_http.tenant_attach(tenant_id) + # Run ignore on the task, thereby cancelling the attach. + # XXX This should take priority over attach, i.e., it should cancel the attach task. + # But neither the failpoint, nor the proper storage_sync2 download functions, + # are sensitive to task_mgr::shutdown. + # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 . 
+ # So, for now, effectively, this ignore here will block until attach task completes. + pageserver_http.tenant_ignore(tenant_id) + + # Cannot attach it due to some local files existing + env.pageserver.allowed_errors.append( + ".*Cannot attach tenant .*?, local tenant directory already exists.*" + ) + with pytest.raises( + expected_exception=PageserverApiException, + match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists", + ): + pageserver_http.tenant_attach(tenant_id) + + tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" + assert len(tenants_after_ignore) + 1 == len( + tenants_before_ignore + ), "Only ignored tenant should be missing" + + # But can load it from local files, that will restore attach. + pageserver_http.tenant_load(tenant_id) + + wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5) + + pg.stop() + pg.start() + ensure_test_data(data_id, data_secret, pg) + + +def insert_test_data( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + data_id: int, + data: str, + pg: Postgres, +): + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE test(id int primary key, secret text); + INSERT INTO test VALUES ({data_id}, '{data}'); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + log.info("waiting for to be ignored tenant data checkpoint upload") + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +def ensure_test_data(data_id: int, data: str, pg: Postgres): + with pg.cursor() as cur: + assert ( + query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data + ), "Should have timeline data back" + + +# Does not use `wait_until` for debugging purposes +def wait_until_tenant_status( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + expected_status: str, + iterations: int, +) -> bool: + for _ in range(iterations): + try: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + log.debug(f"Tenant {tenant_id} status: {tenant}") + if tenant["state"] == expected_status: + return True + except Exception as e: + log.debug(f"Tenant {tenant_id} status retrieval failure: {e}") + + time.sleep(1) + + raise Exception(f"Tenant {tenant_id} did not become {expected_status} in {iterations} seconds") From 98ff0396f83e9686589ec47027bb1737737b9f82 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 5 Dec 2022 13:31:25 +0200 Subject: [PATCH 016/167] tone down error log for successful process termination --- pageserver/src/walredo.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index dfcd49f5c2..378f8deed7 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -922,8 +922,7 @@ impl NoLeakChild { match child.wait() { Ok(exit_status) => { - // log at error level since .kill() is something we only do on errors ATM - error!(exit_status = %exit_status, "wait successful"); + info!(exit_status = %exit_status, "wait successful"); } Err(e) => { error!(error = %e, "wait error; might 
leak the child process; it will show as zombie (defunct)"); From 4530544bb849e1769d7a845cb079f3757f7afca4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 6 Dec 2022 11:03:04 -0500 Subject: [PATCH 017/167] draw_timeline_dirs: accept paths as input --- pageserver/src/bin/draw_timeline_dir.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/src/bin/draw_timeline_dir.rs index ea1ff7f3c7..da13ee452c 100644 --- a/pageserver/src/bin/draw_timeline_dir.rs +++ b/pageserver/src/bin/draw_timeline_dir.rs @@ -11,8 +11,8 @@ //! //! Example use: //! ``` -//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE -//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg //! $ firefox out.svg //! ``` //! @@ -25,6 +25,8 @@ use anyhow::Result; use pageserver::repository::Key; use std::cmp::Ordering; use std::io::{self, BufRead}; +use std::path::PathBuf; +use std::str::FromStr; use std::{ collections::{BTreeMap, BTreeSet}, ops::Range, @@ -65,7 +67,11 @@ fn main() -> Result<()> { let mut ranges: Vec<(Range, Range)> = vec![]; let stdin = io::stdin(); for line in stdin.lock().lines() { - let range = parse_filename(&line.unwrap()); + let line = line.unwrap(); + let line = PathBuf::from_str(&line).unwrap(); + let filename = line.file_name().unwrap(); + let filename = filename.to_str().unwrap(); + let range = parse_filename(filename); ranges.push(range); } From 8f2b3cbded81e94ff7c4575afa9f50a560de0849 Mon Sep 17 00:00:00 2001 From: Kliment Serafimov Date: Tue, 6 Dec 2022 19:57:54 +0100 Subject: [PATCH 018/167] Sentry integration for storage. (#2926) Added basic instrumentation to integrate sentry with the proxy, pageserver, and safekeeper processes. Currently in sentry there are three projects, one for each process. Sentry url is sent to all three processes separately via cli args. 
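
The new libs/utils/src/sentry_init.rs helper listed in the diffstat below is not included in this excerpt. As a rough illustration only, a minimal initializer of this kind could look like the sketch that follows; it assumes the `sentry` crate plus the SENTRY_DSN environment variable set by the systemd units, and the function name and signature are illustrative assumptions rather than the actual module contents.

    // Hypothetical sketch; NOT the actual contents of libs/utils/src/sentry_init.rs.
    // Assumes the `sentry` crate as a dependency and an optional SENTRY_DSN env var,
    // matching how the DSN is passed to each process in the deploy changes below.
    use std::borrow::Cow;
    use std::env;

    /// Initialize Sentry if SENTRY_DSN is set. The returned guard must be kept
    /// alive (e.g. bound in main()) so buffered events are flushed on shutdown.
    pub fn init_sentry(release: Option<Cow<'static, str>>) -> Option<sentry::ClientInitGuard> {
        // Skip initialization entirely when no DSN is configured.
        let dsn = env::var("SENTRY_DSN").ok()?;
        Some(sentry::init((
            dsn,
            sentry::ClientOptions {
                release,
                ..Default::default()
            },
        )))
    }

Under those assumptions, a caller would bind the returned guard for the lifetime of the process, for example `let _sentry_guard = init_sentry(None);` near the top of main(), so that pending events are flushed when the process exits.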
--- .github/ansible/systemd/pageserver.service | 2 +- .github/ansible/systemd/safekeeper.service | 2 +- .github/workflows/build_and_test.yml | 20 +- Cargo.lock | 266 ++++++++++++++++++++- libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 1 + libs/utils/src/sentry_init.rs | 27 +++ pageserver/src/bin/pageserver.rs | 4 + pageserver/src/config.rs | 8 +- proxy/src/main.rs | 4 + safekeeper/src/bin/safekeeper.rs | 7 +- workspace_hack/Cargo.toml | 4 +- 12 files changed, 324 insertions(+), 22 deletions(-) create mode 100644 libs/utils/src/sentry_init.rs diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service index 39e57dbd6c..653e2dc142 100644 --- a/.github/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=pageserver -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index 69827e36ac..7eaed100d8 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=safekeeper -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1701e02dcb..9ec2d919be 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -732,7 +732,7 @@ jobs: ssh-add ssh-key rm -f ssh-key ssh-key-cert.pub ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} + ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-new: @@ -770,7 +770,7 @@ jobs: exit 1 fi ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e 
SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-pr-test-new: @@ -803,7 +803,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-prod-new: @@ -843,7 +843,7 @@ jobs: fi ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}} + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-proxy: @@ -885,8 +885,8 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s deploy-proxy-new: runs-on: [ self-hosted, dev, x64 ] @@ -925,19 +925,19 @@ jobs: - name: Re-deploy scram proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - name: Re-deploy link proxy if: matrix.deploy_link_proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f 
.github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - name: Re-deploy legacy scram proxy if: matrix.deploy_legacy_scram_proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s deploy-proxy-prod-new: runs-on: prod @@ -974,7 +974,7 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] diff --git a/Cargo.lock b/Cargo.lock index 3bcd189d9e..12ab6f17aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -724,6 +724,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "chunked_transfer" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fff857943da45f546682664a79488be82e69e43c1a7a2307679ab9afb3a66d2e" + [[package]] name = "ciborium" version = "0.2.0" @@ -1225,6 +1231,16 @@ dependencies = [ "uuid 0.8.2", ] +[[package]] +name = "debugid" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +dependencies = [ + "serde", + "uuid 1.2.1", +] + [[package]] name = "der-parser" version = "8.1.0" @@ -1394,6 +1410,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.1.0" @@ -1667,6 +1698,17 @@ dependencies = [ "digest", ] +[[package]] +name = "hostname" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" +dependencies = [ + "libc", + "match_cfg", + "winapi", +] + [[package]] name = "http" version = "0.2.8" @@ -1774,6 +1816,19 @@ dependencies = [ "tokio-io-timeout", ] +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + [[package]] name = "iana-time-zone" version = "0.1.51" @@ -1999,6 +2054,12 @@ dependencies = [ "serde", ] +[[package]] +name = "match_cfg" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" + [[package]] name = "matchers" version = "0.1.0" @@ -2111,6 +2172,24 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "native-tls" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nb" version = "0.1.3" @@ -2281,12 +2360,62 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "openssl" +version = "0.10.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020433887e44c27ff16365eaa2d380547a94544ad509aff6eb5b6e3e0b27b376" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07d5c8cb6e57b3a3612064d7b18b117912b4ce70955c2504d4b741c9e244b132" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "os_info" +version = "3.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4750134fb6a5d49afc80777394ad5d95b04bc12068c6abb92fae8f43817270f" +dependencies = [ + "log", + "serde", + "winapi", +] + [[package]] name = "os_str_bytes" version = "6.3.0" @@ -2508,6 +2637,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + [[package]] name = "plotters" version = "0.3.4" @@ -3070,10 +3205,12 @@ dependencies = [ "http-body", "hyper", "hyper-rustls", + "hyper-tls", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -3083,6 +3220,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "tokio", + "tokio-native-tls", "tokio-rustls", "tower-service", "url", @@ -3419,6 +3557,89 @@ version 
= "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +[[package]] +name = "sentry" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6425e2a14006415449fb0a3e9a119df5032f59e7a2d9350cf8738eca290dfc5" +dependencies = [ + "httpdate", + "native-tls", + "reqwest", + "sentry-backtrace", + "sentry-contexts", + "sentry-core", + "sentry-panic", + "tokio", + "ureq", +] + +[[package]] +name = "sentry-backtrace" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d79c194e5c20fe602e81faf39f3cff0f275ec61283f437a892cfd6544da592" +dependencies = [ + "backtrace", + "once_cell", + "regex", + "sentry-core", +] + +[[package]] +name = "sentry-contexts" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1c2a57601eeb870521cc241caee27e57a012f297ece3c1b7eee87f2a531edb5" +dependencies = [ + "hostname", + "libc", + "os_info", + "rustc_version 0.4.0", + "sentry-core", + "uname", +] + +[[package]] +name = "sentry-core" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be90ea119c6d0664c8ab534013bc9e90355e7004d782d5d1492ca513393b929" +dependencies = [ + "once_cell", + "rand", + "sentry-types", + "serde", + "serde_json", +] + +[[package]] +name = "sentry-panic" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec217c3290e3f0d128154da731c28efa8f62cf8e3c3a006fd4bc3407c959176" +dependencies = [ + "sentry-backtrace", + "sentry-core", +] + +[[package]] +name = "sentry-types" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ad85f0addf16310a1fbcf3facc7acb17ef5dbf6ae059d2f3c38442a471404d" +dependencies = [ + "debugid 0.8.0", + "getrandom", + "hex", + "serde", + "serde_json", + "thiserror", + "time", + "url", + "uuid 1.2.1", +] + [[package]] name = "serde" version = "1.0.145" @@ -3706,7 +3927,7 @@ version = "8.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" dependencies = [ - "debugid", + "debugid 0.7.3", "memmap2", "stable_deref_trait", "uuid 0.8.2", @@ -3913,6 +4134,16 @@ dependencies = [ "syn", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-postgres" version = "0.7.6" @@ -4245,6 +4476,15 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" +[[package]] +name = "uname" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b72f89f0ca32e4db1c04e2a72f5345d59796d4866a1ee0609084569f73683dc8" +dependencies = [ + "libc", +] + [[package]] name = "unicode-bidi" version = "0.3.8" @@ -4284,6 +4524,20 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +[[package]] +name = "ureq" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97acb4c28a254fd7a4aeec976c46a7fa404eac4d7c134b30c75144846d7cb8f" +dependencies = 
[ + "base64", + "chunked_transfer", + "log", + "native-tls", + "once_cell", + "url", +] + [[package]] name = "url" version = "2.3.1" @@ -4293,6 +4547,7 @@ dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -4325,6 +4580,7 @@ dependencies = [ "rustls", "rustls-pemfile", "rustls-split", + "sentry", "serde", "serde_json", "serde_with", @@ -4368,6 +4624,12 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -4653,12 +4915,12 @@ dependencies = [ "serde", "stable_deref_trait", "syn", - "time", "tokio", "tokio-util", "tower", "tracing", "tracing-core", + "url", ] [[package]] diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 36a379b47a..47639e8205 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +sentry = "0.29.0" async-trait = "0.1" anyhow = "1.0" bincode = "1.3" diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index e1c9a373e5..b93afb0a59 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -46,6 +46,7 @@ pub mod tcp_listener; pub mod nonblock; // Default signal handling +pub mod sentry_init; pub mod signals; pub mod fs_ext; diff --git a/libs/utils/src/sentry_init.rs b/libs/utils/src/sentry_init.rs new file mode 100644 index 0000000000..4f1c297854 --- /dev/null +++ b/libs/utils/src/sentry_init.rs @@ -0,0 +1,27 @@ +use sentry::ClientInitGuard; +use std::borrow::Cow; +use std::env; + +pub use sentry::release_name; + +#[must_use] +pub fn init_sentry( + release_name: Option>, + extra_options: &[(&str, &str)], +) -> Option { + let dsn = env::var("SENTRY_DSN").ok()?; + + let guard = sentry::init(( + dsn, + sentry::ClientOptions { + release: release_name, + ..Default::default() + }, + )); + sentry::configure_scope(|scope| { + for &(key, value) in extra_options { + scope.set_extra(key, value.into()); + } + }); + Some(guard) +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 6c774ae1ae..f55fe0886a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -26,6 +26,7 @@ use utils::{ lock_file, logging, postgres_backend::AuthType, project_git_version, + sentry_init::{init_sentry, release_name}, signals::{self, Signal}, tcp_listener, }; @@ -85,6 +86,9 @@ fn main() -> anyhow::Result<()> { } }; + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]); + let tenants_path = conf.tenants_path(); if !tenants_path.exists() { utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b3eab6c3cb..ab88dd0ad6 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -333,10 +333,6 @@ impl PageServerConfigBuilder { } pub fn build(self) -> anyhow::Result { - let broker_endpoints = self - .broker_endpoints - .ok_or(anyhow!("No broker endpoints provided"))?; - Ok(PageServerConf { listen_pg_addr: self .listen_pg_addr @@ -372,7 +368,9 @@ impl PageServerConfigBuilder { profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, // TenantConf is handled separately 
default_tenant_conf: TenantConf::default(), - broker_endpoints, + broker_endpoints: self + .broker_endpoints + .ok_or(anyhow!("No broker endpoints provided"))?, broker_etcd_prefix: self .broker_etcd_prefix .ok_or(anyhow!("missing broker_etcd_prefix"))?, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 2055616a6e..2855d1f900 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -28,6 +28,7 @@ use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use tracing::info; use utils::project_git_version; +use utils::sentry_init::{init_sentry, release_name}; project_git_version!(GIT_VERSION); @@ -45,6 +46,9 @@ async fn main() -> anyhow::Result<()> { .with_target(false) .init(); + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(release_name!(), &[]); + let arg_matches = cli().get_matches(); let tls_config = match ( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 49e9e30cdc..8a2894b32d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -35,11 +35,14 @@ use utils::{ http::endpoint, id::NodeId, logging::{self, LogFormat}, - project_git_version, signals, tcp_listener, + project_git_version, + sentry_init::{init_sentry, release_name}, + signals, tcp_listener, }; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; + project_git_version!(GIT_VERSION); fn main() -> anyhow::Result<()> { @@ -133,6 +136,8 @@ fn main() -> anyhow::Result<()> { conf.log_format = LogFormat::from_config(log_format)?; } + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]); start_safekeeper(conf, given_id, arg_matches.get_flag("init")) } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 31e4426ac2..e50a559a4b 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -37,16 +37,16 @@ prost-a6292c17cd707f01 = { package = "prost", version = "0.11", features = ["pro rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] } +reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } stable_deref_trait = { version = "1", features = ["alloc", "std"] } -time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", 
features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["once_cell", "std"] } +url = { version = "2", features = ["serde"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } From 634d0eab68dea668a1b9671fa9cad8f2962f0761 Mon Sep 17 00:00:00 2001 From: Nikita Kalyanov <44959448+nikitakalyanov@users.noreply.github.com> Date: Tue, 6 Dec 2022 21:09:54 +0200 Subject: [PATCH 019/167] pass availability zone to console during pageserver registration (#2991) this is safe because unknown fields are ignored. After the corresponding PR in control plane is merged this field is going to be required Part of https://github.com/neondatabase/cloud/issues/3131 --- .github/ansible/scripts/init_pageserver.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh index 426925a837..e89fc5e667 100644 --- a/.github/ansible/scripts/init_pageserver.sh +++ b/.github/ansible/scripts/init_pageserver.sh @@ -1,7 +1,8 @@ #!/bin/sh -# get instance id from meta-data service +# fetch params from meta-data service INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) +AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) # store fqdn hostname in var HOST=$(hostname -f) @@ -16,7 +17,8 @@ cat < Date: Tue, 6 Dec 2022 20:45:29 +0200 Subject: [PATCH 020/167] Fix tenant config parsing --- pageserver/src/config.rs | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ab88dd0ad6..4542afae33 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -490,7 +490,7 @@ impl PageServerConf { let mut builder = PageServerConfigBuilder::default(); builder.workdir(workdir.to_owned()); - let mut t_conf: TenantConfOpt = Default::default(); + let mut t_conf = TenantConfOpt::default(); for (key, item) in toml.iter() { match key { @@ -621,6 +621,12 @@ impl PageServerConf { if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") { t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?); } + if let Some(trace_read_requests) = item.get("trace_read_requests") { + t_conf.trace_read_requests = + Some(trace_read_requests.as_bool().with_context(|| { + "configure option trace_read_requests is not a bool".to_string() + })?); + } Ok(t_conf) } @@ -1020,6 +1026,35 @@ broker_endpoints = ['{broker_endpoint}'] Ok(()) } + #[test] + fn parse_tenant_config() -> anyhow::Result<()> { + let tempdir = tempdir()?; + let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + + let broker_endpoint = "http://127.0.0.1:7777"; + let trace_read_requests = true; + + let config_string = format!( + r#"{ALL_BASE_VALUES_TOML} +pg_distrib_dir='{}' +broker_endpoints = ['{broker_endpoint}'] + 
+[tenant_config] +trace_read_requests = {trace_read_requests}"#, + pg_distrib_dir.display(), + ); + + let toml = config_string.parse()?; + + let conf = PageServerConf::parse_and_validate(&toml, &workdir)?; + assert_eq!( + conf.default_tenant_conf.trace_read_requests, trace_read_requests, + "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants", + ); + + Ok(()) + } + fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> { let tempdir_path = tempdir.path(); From 6a57d5bbf96e7577b28a5b5c5ef0d458c0abcd5c Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 6 Dec 2022 20:51:46 +0200 Subject: [PATCH 021/167] Make the request tracing test more useful --- .../{performance => regress}/test_read_trace.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) rename test_runner/{performance => regress}/test_read_trace.py (60%) diff --git a/test_runner/performance/test_read_trace.py b/test_runner/regress/test_read_trace.py similarity index 60% rename from test_runner/performance/test_read_trace.py rename to test_runner/regress/test_read_trace.py index a5bd0b8de6..1b00b272c2 100644 --- a/test_runner/performance/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -1,10 +1,14 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar # This test demonstrates how to collect a read trace. It's useful until # it gets replaced by a test that actually does stuff with the trace. +# +# Additionally, tests that pageserver is able to create tenants with custom configs. def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() @@ -23,6 +27,12 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): cur.execute("create table t (i integer);") cur.execute(f"insert into t values (generate_series(1,{10000}));") cur.execute("select count(*) from t;") + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + # wait until pageserver receives that data + pageserver_http = env.pageserver.http_client() + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) # Stop pg so we drop the connection and flush the traces pg.stop() From b447eb4d1e7a3ea1d572183401ec34b7296d1b93 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Wed, 7 Dec 2022 12:56:42 +0200 Subject: [PATCH 022/167] Add postgres-v15 to source tree documentation (#3023) --- docs/sourcetree.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 309f5a6966..17e47b670c 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -45,9 +45,9 @@ and create new databases and accounts (control plane API in our case). Integration tests, written in Python using the `pytest` framework. -`/vendor/postgres-v14`: +`/vendor/postgres-v14` and `/vendor/postgres-v15`: -PostgreSQL source tree, with the modifications needed for Neon. +PostgreSQL source tree per version, with the modifications needed for Neon. 
`/pgxn/neon`: From b51361950317ce873bfb99cf72d3cb3342565be5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 7 Dec 2022 12:27:27 +0200 Subject: [PATCH 023/167] Remove obsolete 'awaits_download' field. It used to be a separate piece of state, but after 9a6c0be823 it's just an alias for the Tenant being in Attaching state. It was only used in one assertion in a test, but that check doesn't make sense anymore, so just remove it. Fixes https://github.com/neondatabase/neon/issues/2930 --- libs/pageserver_api/src/models.rs | 2 -- pageserver/src/http/openapi_spec.yml | 3 --- pageserver/src/http/routes.rs | 17 +++-------------- test_runner/regress/test_remote_storage.py | 1 - 4 files changed, 3 insertions(+), 20 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9d1ad8a022..cf3252a9ae 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -201,8 +201,6 @@ pub struct TimelineInfo { pub last_received_msg_ts: Option, pub pg_version: u32, - pub awaits_download: bool, - pub state: TimelineState, // Some of the above fields are duplicated in 'local' and 'remote', for backwards- diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 932cda50b7..b372410c0d 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -748,7 +748,6 @@ components: - tenant_id - last_record_lsn - disk_consistent_lsn - - awaits_download - state - latest_gc_cutoff_lsn properties: @@ -791,8 +790,6 @@ components: format: hex last_received_msg_ts: type: integer - awaits_download: - type: boolean state: type: string latest_gc_cutoff_lsn: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d1fdf26a5a..0ef555c4aa 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; -use pageserver_api::models::TenantState; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -81,12 +80,11 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res // Helper function to construct a TimelineInfo struct for a timeline fn build_timeline_info( - tenant_state: TenantState, timeline: &Arc, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { - let mut info = build_timeline_info_common(tenant_state, timeline)?; + let mut info = build_timeline_info_common(timeline)?; if include_non_incremental_logical_size { info.current_logical_size_non_incremental = Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?); @@ -98,10 +96,7 @@ fn build_timeline_info( Ok(info) } -fn build_timeline_info_common( - tenant_state: TenantState, - timeline: &Arc, -) -> anyhow::Result { +fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -153,10 +148,6 @@ fn build_timeline_info_common( state, - // XXX bring back tracking of downloads per timeline, or, introduce - // an 'Attaching' state for the timeline and get rid of this field. - awaits_download: tenant_state == TenantState::Attaching, - // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility // with the control plane. 
local: LocalTimelineInfo { @@ -201,7 +192,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. - let timeline_info = build_timeline_info_common(tenant.current_state(), &new_timeline) + let timeline_info = build_timeline_info_common(&new_timeline) .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } @@ -227,7 +218,6 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_info = build_timeline_info( - tenant.current_state(), &timeline, include_non_incremental_logical_size, include_non_incremental_physical_size, @@ -295,7 +285,6 @@ async fn timeline_detail_handler(request: Request) -> Result= current_lsn ), "current db Lsn should should not be less than the one stored on remote storage" - assert not detail["awaits_download"] pg = env.postgres.create_start("main") with pg.cursor() as cur: From c74dca95fc1152b642c29b15a20ea65d6d1b3a78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Wed, 7 Dec 2022 14:24:07 +0200 Subject: [PATCH 024/167] Helm values for old staging and one region in new staging (#2922) helm values for the new `storage-broker`. gRPC, over secure connection with a proper certificate, but no authentication. Uses alb ingress in the old cluster and nginx ingress for the new one. The chart is deployed and the addresses are functional, while the pipeline doesn't exist yet. --- ...ev-eu-west-1-zeta.neon-storage-broker.yaml | 57 +++++++++ ...ev-us-east-2-beta.neon-storage-broker.yaml | 57 +++++++++ .../neon-stress.neon-storage-broker.yaml | 54 ++++++++ ...u-central-1-gamma.neon-storage-broker.yaml | 57 +++++++++ ...d-us-east-2-delta.neon-storage-broker.yaml | 57 +++++++++ .../production.neon-storage-broker.yaml | 54 ++++++++ .../staging.neon-storage-broker.yaml | 54 ++++++++ .github/workflows/build_and_test.yml | 119 +++++++++++++++++- 8 files changed, 506 insertions(+), 3 deletions(-) create mode 100644 .github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml create mode 100644 .github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml create mode 100644 .github/helm-values/neon-stress.neon-storage-broker.yaml create mode 100644 .github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml create mode 100644 .github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml create mode 100644 .github/helm-values/production.neon-storage-broker.yaml create mode 100644 .github/helm-values/staging.neon-storage-broker.yaml diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml new file mode 100644 index 0000000000..2ce8c45a1b --- /dev/null +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -0,0 +1,57 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: staging + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx-internal + nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" + + hosts: + - host: storage-broker-zeta.eu-west-1.aws.neon.build + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - storage-broker-zeta.eu-west-1.aws.neon.build + secretName: storage-broker-tls + + 
+metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . }}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml new file mode 100644 index 0000000000..6b75ffd75c --- /dev/null +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -0,0 +1,57 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: staging + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx-internal + nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" + + hosts: + - host: storage-broker-beta.us-east-2.aws.neon.build + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - storage-broker-beta.us-east-2.aws.neon.build + secretName: storage-broker-tls + + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . 
}}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/helm-values/neon-stress.neon-storage-broker.yaml b/.github/helm-values/neon-stress.neon-storage-broker.yaml new file mode 100644 index 0000000000..b141246df0 --- /dev/null +++ b/.github/helm-values/neon-stress.neon-storage-broker.yaml @@ -0,0 +1,54 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: neon-stress + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/healthcheck-path: /status + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' + alb.ingress.kubernetes.io/scheme: "internal" + alb.ingress.kubernetes.io/target-type: "ip" + alb.ingress.kubernetes.io/ssl-redirect: "443" + alb.ingress.kubernetes.io/backend-protocol-version: "GRPC" + + hosts: + - host: storage-broker-stress.stage.neon.tech + paths: + - path: / + pathType: Prefix + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . }}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml new file mode 100644 index 0000000000..6b8b0c2f83 --- /dev/null +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -0,0 +1,57 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: production + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx-internal + nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" + + hosts: + - host: storage-broker-gamma.eu-central-1.aws.neon.tech + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - storage-broker-gamma.eu-central-1.aws.neon.tech + secretName: storage-broker-tls + + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . 
}}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml new file mode 100644 index 0000000000..c6266b95f9 --- /dev/null +++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -0,0 +1,57 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: production + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx-internal + nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" + + hosts: + - host: storage-broker-delta.us-east-2.aws.neon.tech + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - storage-broker-delta.us-east-2.aws.neon.tech + secretName: storage-broker-tls + + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . }}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/helm-values/production.neon-storage-broker.yaml b/.github/helm-values/production.neon-storage-broker.yaml new file mode 100644 index 0000000000..299d6fa89e --- /dev/null +++ b/.github/helm-values/production.neon-storage-broker.yaml @@ -0,0 +1,54 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: production + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/healthcheck-path: /status + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' + alb.ingress.kubernetes.io/scheme: "internal" + alb.ingress.kubernetes.io/target-type: "ip" + alb.ingress.kubernetes.io/ssl-redirect: "443" + alb.ingress.kubernetes.io/backend-protocol-version: "GRPC" + + hosts: + - host: storage-broker.neon.tech + paths: + - path: / + pathType: Prefix + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . 
}}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/helm-values/staging.neon-storage-broker.yaml b/.github/helm-values/staging.neon-storage-broker.yaml new file mode 100644 index 0000000000..54e1e1bba2 --- /dev/null +++ b/.github/helm-values/staging.neon-storage-broker.yaml @@ -0,0 +1,54 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: staging + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/healthcheck-path: /status + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' + alb.ingress.kubernetes.io/scheme: "internal" + alb.ingress.kubernetes.io/target-type: "ip" + alb.ingress.kubernetes.io/ssl-redirect: "443" + alb.ingress.kubernetes.io/backend-protocol-version: "GRPC" + + hosts: + - host: storage-broker.stage.neon.tech + paths: + - path: / + pathType: Prefix + +metrics: + enabled: true + serviceMonitor: + enabled: true + selector: + release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . }}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9ec2d919be..a432e875dd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -668,11 +668,11 @@ jobs: - id: set-matrix run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}' - NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}' + STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}' + NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY", 
storage_broker_config: }' echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' + PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" @@ -888,6 +888,47 @@ jobs: helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + deploy-storage-broker-staging: + runs-on: [ self-hosted, dev, x64 ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Add curl + run: apt update && apt install curl -y + + - name: Store kubeconfig file + run: | + echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Setup helm v3 + run: | + curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Deploy storage-broker + run: + DOCKER_TAG=${{ needs.tag.outputs.build-tag }} + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + deploy-proxy-new: runs-on: [ self-hosted, dev, x64 ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned @@ -939,6 +980,41 @@ jobs: DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + deploy-storage-broker-dev-new: + runs-on: [ self-hosted, dev, x64 ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + # Compute image isn't strictly required for proxy deploy, but 
let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, tag, regress-tests ] + if: | + (github.ref_name == 'main') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + DOCKER_TAG=${{ needs.tag.outputs.build-tag }} + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + deploy-proxy-prod-new: runs-on: prod container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -976,6 +1052,43 @@ jobs: DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + deploy-storage-broker-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, tag, regress-tests ] + if: | + (github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + DOCKER_TAG=${{ needs.tag.outputs.build-tag }} + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] container: From a46a81b5cbb0f4b9022152c609428247e288804d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 7 Dec 2022 12:35:31 +0200 Subject: [PATCH 025/167] Fix updating "trace_read_requests" with /v1/tenant/config mgmt API. The new "trace_read_requests" option was missing from the parse_toml_tenant_conf function that reads the config file. Because of that, the option was ignored, which caused the test_read_trace.py test to fail. 
It used to work before commit 9a6c0be823, because the TenantConfigOpt struct was constructed directly in tenant_create_handler, but now it is saved and read back from disk even for a newly created tenant. The abovementioned bug was fixed in commit 09393279c6 already, which added the missing code to parse_toml_tenant_conf() to parse the new "trace_read_requests" option. This commit fixes one more function that was missed earlier, and adds more detail to the error message if parsing the config file fails. --- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant_config.rs | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 87f92402b1..88476cf7b6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -782,7 +782,7 @@ impl Tenant { let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, Err(e) => { - error!("load tenant config failed: {}", e); + error!("load tenant config failed: {:?}", e); return Tenant::create_broken_tenant(conf, tenant_id); } }; diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 10b8a589c3..1204d1abd8 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -185,6 +185,9 @@ impl TenantConfOpt { if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { self.max_lsn_wal_lag = Some(max_lsn_wal_lag); } + if let Some(trace_read_requests) = other.trace_read_requests { + self.trace_read_requests = Some(trace_read_requests); + } } } From 6dfd7cb1d0ac10bb0c248f9ed800b81d7707fc6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Wed, 7 Dec 2022 17:15:51 +0200 Subject: [PATCH 026/167] Neon storage broker helm value fixes (#3025) * We were missing one cluster in production: `prod-ap-southeast-1-epsilon` configs. * We had `metrics` enabled. This means creating `ServiceScrape` objects, but since those clusters don't have `kube-prometheus-stack` like older ones, we are missing the CRDs, so the helm deploy fails. 
--- ...ev-eu-west-1-zeta.neon-storage-broker.yaml | 6 +-- ...ev-us-east-2-beta.neon-storage-broker.yaml | 6 +-- ...utheast-1-epsilon.neon-storage-broker.yaml | 53 +++++++++++++++++++ ...u-central-1-gamma.neon-storage-broker.yaml | 6 +-- ...d-us-east-2-delta.neon-storage-broker.yaml | 6 +-- 5 files changed, 57 insertions(+), 20 deletions(-) create mode 100644 .github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index 2ce8c45a1b..296785635c 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -24,11 +24,7 @@ ingress: metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + enabled: false extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index 6b75ffd75c..f197d2e579 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -24,11 +24,7 @@ ingress: metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + enabled: false extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml new file mode 100644 index 0000000000..959033939a --- /dev/null +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml @@ -0,0 +1,53 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: production + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx-internal + nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" + + hosts: + - host: storage-broker-epsilon.ap-southeast-1.aws.neon.tech + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - storage-broker-epsilon.ap-southeast-1.aws.neon.tech + secretName: storage-broker-tls + + +metrics: + enabled: false + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . 
}}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml index 6b8b0c2f83..1184ff442c 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -24,11 +24,7 @@ ingress: metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + enabled: false extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml index c6266b95f9..651b87b96a 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -24,11 +24,7 @@ ingress: metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + enabled: false extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 From ac0c167a858029ecbb31737d26c1fd8de6da1d35 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 1 Dec 2022 07:05:20 -0500 Subject: [PATCH 027/167] improve pidfile handling This patch centralize the logic of creating & reading pid files into the new pid_file module and improves upon / makes explicit a few race conditions that existed with the previous code. Starting Processes / Creating Pidfiles ====================================== Before this patch, we had three places that had very similar-looking match lock_file::create_lock_file { ... } blocks. After this change, they can use a straight-forward call provided by the pid_file: pid_file::claim_pid_file_for_pid() Stopping Processes / Reading Pidfiles ===================================== The new pid_file module provides a function to read a pidfile, called read_pidfile(), that returns a pub enum PidFileRead { NotExist, NotHeldByAnyProcess(PidFileGuard), LockedByOtherProcess(Pid), } If we get back NotExist, there is nothing to kill. If we get back NotHeldByAnyProcess, the pid file is stale and we must ignore its contents. If it's LockedByOtherProcess, it's either another pidfile reader or, more likely, the daemon that is still running. In this case, we can read the pid in the pidfile and kill it. There's still a small window where this is racy, but it's not a regression compared to what we have before. The NotHeldByAnyProcess is an improvement over what we had before this patch. Before, we would blindly read the pidfile contents and kill, even if no other process held the flock. If the pidfile was stale (NotHeldByAnyProcess), then that kill would either result in ESRCH or hit some other unrelated process on the system. This patch avoids the latter cacse by grabbing an exclusive flock before reading the pidfile, and returning the flock to the caller in the form of a guard object, to avoid concurrent reads / kills. 
It's hopefully irrelevant in practice, but it's a little robustness that we get for free here. Maintain flock on Pidfile of ETCD / any InitialPidFile::Create() ================================================================ Pageserver and safekeeper create their pidfiles themselves. But for etcd, neon_local creates the pidfile (InitialPidFile::Create()). Before this change, we would unlock the etcd pidfile as soon as `neon_local start` exits, simply because no-one else kept the FD open. During `neon_local stop`, that results in a stale pid file, aka, NotHeldByAnyProcess, and it would henceforth not trust that the PID stored in the file is still valid. With this patch, we make the etcd process inherit the pidfile FD, thereby keeping the flock held until it exits. --- control_plane/src/background_process.rs | 160 +++++++++++++-------- control_plane/src/bin/neon_local.rs | 2 +- libs/utils/src/lib.rs | 1 + libs/utils/src/lock_file.rs | 180 +++++++++++++++--------- libs/utils/src/pid_file.rs | 165 ++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 28 +--- safekeeper/src/bin/safekeeper.rs | 28 +--- 7 files changed, 400 insertions(+), 164 deletions(-) create mode 100644 libs/utils/src/pid_file.rs diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index d21a939cb7..1a5ac1e2fe 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -14,17 +14,19 @@ use std::ffi::OsStr; use std::io::Write; -use std::path::Path; +use std::os::unix::prelude::AsRawFd; +use std::os::unix::process::CommandExt; +use std::path::{Path, PathBuf}; use std::process::{Child, Command}; use std::time::Duration; use std::{fs, io, thread}; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::Context; use nix::errno::Errno; +use nix::fcntl::{FcntlArg, FdFlag}; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; - -use utils::lock_file; +use utils::pid_file::{self, PidFileRead}; // These constants control the loop used to poll for process start / stop. // @@ -86,6 +88,14 @@ where let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command)); filled_cmd.envs(envs); + let pid_file_to_check = match initial_pid_file { + InitialPidFile::Create(path) => { + pre_exec_create_pidfile(filled_cmd, path); + path + } + InitialPidFile::Expect(path) => path, + }; + let mut spawned_process = filled_cmd.spawn().with_context(|| { format!("Could not spawn {process_name}, see console output and log files for details.") })?; @@ -95,29 +105,8 @@ where .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?, ); - let pid_file_to_check = match initial_pid_file { - InitialPidFile::Create(target_pid_file_path) => { - match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) { - lock_file::LockCreationResult::Created { .. } => { - // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon - // as this CLI invocation exits, so it's a bit useless, but doesn't any harm either. - } - lock_file::LockCreationResult::AlreadyLocked { .. 
} => { - anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process") - } - lock_file::LockCreationResult::CreationFailed(e) => { - return Err(e.context(format!( - "Failed to create pid file for {process_name} at path {target_pid_file_path:?}" - ))) - } - } - None - } - InitialPidFile::Expect(pid_file_path) => Some(pid_file_path), - }; - for retries in 0..RETRIES { - match process_started(pid, pid_file_to_check, &process_status_check) { + match process_started(pid, Some(pid_file_to_check), &process_status_check) { Ok(true) => { println!("\n{process_name} started, pid: {pid}"); return Ok(spawned_process); @@ -165,12 +154,27 @@ pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<() /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> { - if !pid_file.exists() { - println!("{process_name} is already stopped: no pid file {pid_file:?} is present"); - return Ok(()); - } - let pid = read_pidfile(pid_file)?; + let pid = match pid_file::read(pid_file) + .with_context(|| format!("read pid_file {pid_file:?}"))? + { + PidFileRead::NotExist => { + println!("{process_name} is already stopped: no pid file present at {pid_file:?}"); + return Ok(()); + } + PidFileRead::NotHeldByAnyProcess(_) => { + // Don't try to kill according to file contents beacuse the pid might have been re-used by another process. + // Don't delete the file either, it can race with new pid file creation. + // Read `pid_file` module comment for details. + println!( + "No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}" + ); + return Ok(()); + } + PidFileRead::LockedByOtherProcess(pid) => pid, + }; + // XXX the pid could become invalid (and recycled) at any time before the kill() below. + // send signal let sig = if immediate { print!("Stopping {process_name} with pid {pid} immediately.."); Signal::SIGQUIT @@ -182,8 +186,9 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any match kill(pid, sig) { Ok(()) => (), Err(Errno::ESRCH) => { + // Again, don't delete the pid file. The unlink can race with a new pid file being created. println!( - "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found" + "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled. Lucky we didn't harm anyone." ); return Ok(()); } @@ -252,6 +257,69 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), +/// 1. Claims a pidfile with a fcntl lock on it and +/// 2. Sets up the pidfile's file descriptor so that it (and the lock) +/// will remain held until the cmd exits. +fn pre_exec_create_pidfile
<P>
(cmd: &mut Command, path: P) -> &mut Command +where + P: Into, +{ + let path: PathBuf = path.into(); + // SAFETY + // pre_exec is marked unsafe because it runs between fork and exec. + // Why is that dangerous in various ways? + // Long answer: https://github.com/rust-lang/rust/issues/39575 + // Short answer: in a multi-threaded program, other threads may have + // been inside of critical sections at the time of fork. In the + // original process, that was allright, assuming they protected + // the critical sections appropriately, e.g., through locks. + // Fork adds another process to the mix that + // 1. Has a single thread T + // 2. In an exact copy of the address space at the time of fork. + // A variety of problems scan occur now: + // 1. T tries to grab a lock that was locked at the time of fork. + // It will wait forever since in its address space, the lock + // is in state 'taken' but the thread that would unlock it is + // not there. + // 2. A rust object that represented some external resource in the + // parent now got implicitly copied by the the fork, even though + // the object's type is not `Copy`. The parent program may use + // non-copyability as way to enforce unique ownership of an + // external resource in the typesystem. The fork breaks that + // assumption, as now both parent and child process have an + // owned instance of the object that represents the same + // underlying resource. + // While these seem like niche problems, (1) in particular is + // highly relevant. For example, `malloc()` may grab a mutex internally, + // and so, if we forked while another thread was mallocing' and our + // pre_exec closure allocates as well, it will block on the malloc + // mutex forever + // + // The proper solution is to only use C library functions that are marked + // "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html + // + // With this specific pre_exec() closure, the non-error path doesn't allocate. + // The error path uses `anyhow`, and hence does allocate. + // We take our chances there, hoping that any potential disaster is constrained + // to the child process (e.g., malloc has no state ourside of the child process). + // Last, `expect` prints to stderr, and stdio is not async-signal-safe. + // Again, we take our chances, making the same assumptions as for malloc. + unsafe { + cmd.pre_exec(move || { + let file = pid_file::claim_for_current_process(&path).expect("claim pid file"); + // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile + // remains locked after exec. + nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty())) + .expect("remove FD_CLOEXEC"); + // Don't run drop(file), it would close the file before we actually exec. + std::mem::forget(file); + Ok(()) + }); + } + cmd +} + fn process_started( pid: Pid, pid_file_to_check: Option<&Path>, @@ -262,14 +330,11 @@ where { match status_check() { Ok(true) => match pid_file_to_check { - Some(pid_file_path) => { - if pid_file_path.exists() { - let pid_in_file = read_pidfile(pid_file_path)?; - Ok(pid_in_file == pid) - } else { - Ok(false) - } - } + Some(pid_file_path) => match pid_file::read(pid_file_path)? { + PidFileRead::NotExist => Ok(false), + PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), + PidFileRead::NotHeldByAnyProcess(_) => Ok(false), + }, None => Ok(true), }, Ok(false) => Ok(false), @@ -277,21 +342,6 @@ where } } -/// Read a PID file -/// -/// We expect a file that contains a single integer. 
-fn read_pidfile(pidfile: &Path) -> Result { - let pid_str = fs::read_to_string(pidfile) - .with_context(|| format!("failed to read pidfile {pidfile:?}"))?; - let pid: i32 = pid_str - .parse() - .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?; - if pid < 1 { - bail!("pidfile {pidfile:?} contained bad value '{pid}'"); - } - Ok(Pid::from_raw(pid)) -} - fn process_has_stopped(pid: Pid) -> anyhow::Result { match kill(pid, None) { // Process exists, keep waiting diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 42a9199037..99ddae862d 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -324,7 +324,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { pg_version, ) .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e}"); + eprintln!("pageserver init failed: {e:?}"); exit(1); }); diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b93afb0a59..6d35fd9f7b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -34,6 +34,7 @@ pub mod sock_split; pub mod logging; pub mod lock_file; +pub mod pid_file; // Misc pub mod accum; diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 4fef65852b..adbf47eb7a 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -1,81 +1,133 @@ -//! A module to create and read lock files. A lock file ensures that only one -//! process is running at a time, in a particular directory. +//! A module to create and read lock files. //! -//! File locking is done using [`fcntl::flock`], which means that holding the -//! lock on file only prevents acquiring another lock on it; all other -//! operations are still possible on files. Other process can still open, read, -//! write, or remove the file, for example. -//! If the file is removed while a process is holding a lock on it, -//! the process that holds the lock does not get any error or notification. -//! Furthermore, you can create a new file with the same name and lock the new file, -//! while the old process is still running. -//! Deleting the lock file while the locking process is still running is a bad idea! +//! File locking is done using [`fcntl::flock`] exclusive locks. +//! The only consumer of this module is currently [`pid_file`]. +//! See the module-level comment there for potential pitfalls +//! with lock files that are used to store PIDs (pidfiles). -use std::{fs, os::unix::prelude::AsRawFd, path::Path}; +use std::{ + fs, + io::{Read, Write}, + ops::Deref, + os::unix::prelude::AsRawFd, + path::{Path, PathBuf}, +}; use anyhow::Context; -use nix::fcntl; +use nix::{errno::Errno::EAGAIN, fcntl}; use crate::crashsafe; -pub enum LockCreationResult { - Created { - new_lock_contents: String, - file: fs::File, - }, - AlreadyLocked { - existing_lock_contents: String, - }, - CreationFailed(anyhow::Error), +/// A handle to an open and unlocked, but not-yet-written lock file. +/// Returned by [`create_exclusive`]. +#[must_use] +pub struct UnwrittenLockFile { + path: PathBuf, + file: fs::File, } -/// Creates a lock file in the path given and writes the given contents into the file. -/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program. -pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult { - let lock_file = match fs::OpenOptions::new() +/// Returned by [`UnwrittenLockFile::write_content`]. 
+#[must_use] +pub struct LockFileGuard(fs::File); + +impl Deref for LockFileGuard { + type Target = fs::File; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl UnwrittenLockFile { + /// Replace the content of this lock file with the byte representation of `contents`. + pub fn write_content(mut self, contents: String) -> anyhow::Result { + self.file + .set_len(0) + .context("Failed to truncate lockfile")?; + self.file + .write_all(contents.as_bytes()) + .with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?; + crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?; + Ok(LockFileGuard(self.file)) + } +} + +/// Creates and opens a lock file in the path, grabs an exclusive flock on it, and returns +/// a handle that allows overwriting the locked file's content. +/// +/// The exclusive lock is released when dropping the returned handle. +/// +/// It is not an error if the file already exists. +/// It is an error if the file is already locked. +pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result { + let lock_file = fs::OpenOptions::new() .create(true) // O_CREAT .write(true) .open(lock_file_path) - .context("Failed to open lock file") - { - Ok(file) => file, - Err(e) => return LockCreationResult::CreationFailed(e), - }; + .context("open lock file")?; - match fcntl::flock( + let res = fcntl::flock( lock_file.as_raw_fd(), fcntl::FlockArg::LockExclusiveNonblock, - ) { - Ok(()) => { - match lock_file - .set_len(0) - .context("Failed to truncate lockfile") - .and_then(|()| { - fs::write(lock_file_path, &contents).with_context(|| { - format!("Failed to write '{contents}' contents into lockfile") - }) - }) - .and_then(|()| { - crashsafe::fsync_file_and_parent(lock_file_path) - .context("Failed to fsync lockfile") - }) { - Ok(()) => LockCreationResult::Created { - new_lock_contents: contents, - file: lock_file, - }, - Err(e) => LockCreationResult::CreationFailed(e), - } - } - Err(nix::errno::Errno::EAGAIN) => { - match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") { - Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked { - existing_lock_contents, - }, - Err(e) => LockCreationResult::CreationFailed(e), - } - } - Err(e) => { - LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}")) - } + ); + match res { + Ok(()) => Ok(UnwrittenLockFile { + path: lock_file_path.to_owned(), + file: lock_file, + }), + Err(EAGAIN) => anyhow::bail!("file is already locked"), + Err(e) => Err(e).context("flock error"), + } +} + +/// Returned by [`read_and_hold_lock_file`]. +/// Check out the [`pid_file`] module for what the variants mean +/// and potential caveats if the lock files that are used to store PIDs. +pub enum LockFileRead { + /// No file exists at the given path. + NotExist, + /// No other process held the lock file, so we grabbed an flock + /// on it and read its contents. + /// Release the flock by dropping the [`LockFileGuard`]. + NotHeldByAnyProcess(LockFileGuard, String), + /// The file exists but another process was holding an flock on it. + LockedByOtherProcess { + not_locked_file: fs::File, + content: String, + }, +} + +/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to +/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked. +/// Check the [`LockFileRead`] variants for details. 
+pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result { + let res = fs::OpenOptions::new().read(true).open(path); + let mut lock_file = match res { + Ok(f) => f, + Err(e) => match e.kind() { + std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist), + _ => return Err(e).context("open lock file"), + }, + }; + let res = fcntl::flock( + lock_file.as_raw_fd(), + fcntl::FlockArg::LockExclusiveNonblock, + ); + // We need the content regardless of lock success / failure. + // But, read it after flock so that, if it succeeded, the content is consistent. + let mut content = String::new(); + lock_file + .read_to_string(&mut content) + .context("read lock file")?; + match res { + Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess( + LockFileGuard(lock_file), + content, + )), + Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess { + not_locked_file: lock_file, + content, + }), + Err(e) => Err(e).context("flock error"), } } diff --git a/libs/utils/src/pid_file.rs b/libs/utils/src/pid_file.rs new file mode 100644 index 0000000000..e634b08f2a --- /dev/null +++ b/libs/utils/src/pid_file.rs @@ -0,0 +1,165 @@ +//! Abstraction to create & read pidfiles. +//! +//! A pidfile is a file in the filesystem that stores a process's PID. +//! Its purpose is to implement a singleton behavior where only +//! one process of some "kind" is supposed to be running at a given time. +//! The "kind" is identified by the pidfile. +//! +//! During process startup, the process that is supposed to be a singleton +//! must [claim][`claim_for_current_process`] the pidfile first. +//! If that is unsuccessful, the process must not act as the singleton, i.e., +//! it must not access any of the resources that only the singleton may access. +//! +//! A common need is to signal a running singleton process, e.g., to make +//! it shut down and exit. +//! For that, we have to [`read`] the pidfile. The result of the `read` operation +//! tells us if there is any singleton process, and if so, what PID it has. +//! We can then proceed to signal it, although some caveats still apply. +//! Read the function-level documentation of [`read`] for that. +//! +//! ## Never Remove Pidfiles +//! +//! It would be natural to assume that the process who claimed the pidfile +//! should remove it upon exit to avoid leaving a stale pidfile in place. +//! However, we already have a reliable way to detect staleness of the pidfile, +//! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it. +//! +//! And further, removing pidfiles would introduce a **catastrophic race condition** +//! where two processes are running that are supposed to be singletons. +//! Suppose we were to remove our pidfile during process shutdown. +//! Here is how the race plays out: +//! - Suppose we have a service called `myservice` with pidfile `myservice.pidfile`. +//! - Process `A` starts to shut down. +//! - Process `B` is just starting up +//! - It `open("myservice.pid", O_WRONLY|O_CREAT)` the file +//! - It blocks on `flock` +//! - Process `A` removes the pidfile as the last step of its shutdown procedure +//! - `unlink("myservice.pid") +//! - Process `A` exits +//! - This releases its `flock` and unblocks `B` +//! - Process `B` still has the file descriptor for `myservice.pid` open +//! - Process `B` writes its PID into `myservice.pid`. +//! - But the `myservice.pid` file has been unlinked, so, there is `myservice.pid` +//! in the directory. +//! - Process `C` starts +//! 
- It `open("myservice.pid", O_WRONLY|O_CREAT)` which creates a new file (new inode) +//! - It `flock`s the file, which, since it's a different file, does not block +//! - It writes its PID into the file +//! +//! At this point, `B` and `C` are running, which is hazardous. +//! Morale of the story: don't unlink pidfiles, ever. + +use std::{ops::Deref, path::Path}; + +use anyhow::Context; +use nix::unistd::Pid; + +use crate::lock_file::{self, LockFileRead}; + +/// Keeps a claim on a pidfile alive until it is dropped. +/// Returned by [`claim_for_current_process`]. +#[must_use] +pub struct PidFileGuard(lock_file::LockFileGuard); + +impl Deref for PidFileGuard { + type Target = lock_file::LockFileGuard; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// Try to claim `path` as a pidfile for the current process. +/// +/// If another process has already claimed the pidfile, and it is still running, +/// this function returns ane error. +/// Otherwise, the function `flock`s the file and updates its contents to the +/// current process's PID. +/// If the update fails, the flock is released and an error returned. +/// On success, the function returns a [`PidFileGuard`] to keep the flock alive. +/// +/// ### Maintaining A Claim +/// +/// It is the caller's responsibility to maintain the claim. +/// The claim ends as soon as the returned guard object is dropped. +/// To maintain the claim for the remaining lifetime of the current process, +/// use [`std::mem::forget`] or similar. +pub fn claim_for_current_process(path: &Path) -> anyhow::Result { + let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?; + // if any of the next steps fail, we drop the file descriptor and thereby release the lock + let guard = unwritten_lock_file + .write_content(Pid::this().to_string()) + .context("write pid to lock file")?; + Ok(PidFileGuard(guard)) +} + +/// Returned by [`read`]. +pub enum PidFileRead { + /// No file exists at the given path. + NotExist, + /// The given pidfile is currently not claimed by any process. + /// To determine this, the [`read`] operation acquired + /// an exclusive flock on the file. The lock is still held and responsibility + /// to release it is returned through the guard object. + /// Before releasing it, other [`claim_for_current_process`] or [`read`] calls + /// will fail. + /// + /// ### Caveats + /// + /// Do not unlink the pidfile from the filesystem. See module-comment for why. + NotHeldByAnyProcess(PidFileGuard), + /// The given pidfile is still claimed by another process whose PID is given + /// as part of this variant. + /// + /// ### Caveats + /// + /// 1. The other process might exit at any time, turning the given PID stale. + /// 2. There is a small window in which `claim_for_current_process` has already + /// locked the file but not yet updates its contents. [`read`] will return + /// this variant here, but with the old file contents, i.e., a stale PID. + /// + /// The kernel is free to recycle PID once it has been `wait(2)`ed upon by + /// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill` + /// system call on it, bears the risk of killing an unrelated process. + /// This is an inherent limitation of using pidfiles. + /// The only race-free solution is to have a supervisor-process with a lifetime + /// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`). 
+ LockedByOtherProcess(Pid), +} + +/// Try to read the file at the given path as a pidfile that was previously created +/// through [`claim_for_current_process`]. +/// +/// On success, this function returns a [`PidFileRead`]. +/// Check its docs for a description of the meaning of its different variants. +pub fn read(pidfile: &Path) -> anyhow::Result { + let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?; + let ret = match res { + LockFileRead::NotExist => PidFileRead::NotExist, + LockFileRead::NotHeldByAnyProcess(guard, _) => { + PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard)) + } + LockFileRead::LockedByOtherProcess { + not_locked_file: _not_locked_file, + content, + } => { + // XXX the read races with the write in claim_pid_file_for_pid(). + // But pids are smaller than a page, so the kernel page cache will lock for us. + // The only problem is that we might get the old contents here. + // Can only fix that by implementing some scheme that downgrades the + // exclusive lock to shared lock in claim_pid_file_for_pid(). + PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?) + } + }; + Ok(ret) +} + +fn parse_pidfile_content(content: &str) -> anyhow::Result { + let pid: i32 = content + .parse() + .map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?; + if pid < 1 { + anyhow::bail!("bad value in pidfile '{pid}'"); + } + Ok(Pid::from_raw(pid)) +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f55fe0886a..3995229e03 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -7,7 +7,6 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; -use nix::unistd::Pid; use tracing::*; use metrics::set_build_info_metric; @@ -23,7 +22,7 @@ use pageserver::{ use remote_storage::GenericRemoteStorage; use utils::{ auth::JwtAuth, - lock_file, logging, + logging, postgres_backend::AuthType, project_git_version, sentry_init::{init_sentry, release_name}, @@ -220,28 +219,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { } let lock_file_path = conf.workdir.join(PID_FILE_NAME); - let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) { - lock_file::LockCreationResult::Created { - new_lock_contents, - file, - } => { - info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}"); - file - } - lock_file::LockCreationResult::AlreadyLocked { - existing_lock_contents, - } => anyhow::bail!( - "Could not lock pid file; pageserver is already running in {:?} with PID {}", - conf.workdir, - existing_lock_contents - ), - lock_file::LockCreationResult::CreationFailed(e) => { - return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}"))) - } - }; + let lock_file = + utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; + info!("Claimed pid file at {lock_file_path:?}"); + // ensure that the lock file is held even if the main thread of the process is panics // we need to release the lock file only when the current process is gone - let _ = Box::leak(Box::new(lock_file)); + std::mem::forget(lock_file); // TODO: Check that it looks like a valid repository before going further diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 8a2894b32d..fcd3065c65 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ 
b/safekeeper/src/bin/safekeeper.rs @@ -4,7 +4,6 @@ use anyhow::{bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, Command}; use const_format::formatcp; -use nix::unistd::Pid; use remote_storage::RemoteStorageConfig; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; @@ -15,7 +14,7 @@ use tokio::sync::mpsc; use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; -use utils::lock_file; +use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::broker; @@ -147,28 +146,13 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo // Prevent running multiple safekeepers on the same directory let lock_file_path = conf.workdir.join(PID_FILE_NAME); - let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) { - lock_file::LockCreationResult::Created { - new_lock_contents, - file, - } => { - info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}"); - file - } - lock_file::LockCreationResult::AlreadyLocked { - existing_lock_contents, - } => anyhow::bail!( - "Could not lock pid file; safekeeper is already running in {:?} with PID {}", - conf.workdir, - existing_lock_contents - ), - lock_file::LockCreationResult::CreationFailed(e) => { - return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}"))) - } - }; + let lock_file = + pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; + info!("Claimed pid file at {lock_file_path:?}"); + // ensure that the lock file is held even if the main thread of the process is panics // we need to release the lock file only when the current process is gone - let _ = Box::leak(Box::new(lock_file)); + std::mem::forget(lock_file); // Set or read our ID. set_id(&mut conf, given_id)?; From b50e0793cf482e47f28f4d34b9c959885f0a167d Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 7 Dec 2022 23:11:02 +0200 Subject: [PATCH 028/167] Rework remote_storage interface (#2993) Changes: * Remove `RemoteObjectId` concept from remote_storage. Operate directly on /-separated names instead. 
These names are now represented by struct `RemotePath` which was renamed from struct `RelativePath` * Require remote storage to operate on relative paths for its contents, thus simplifying the way to derive them in pageserver and safekeeper * Make `IndexPart` to use `String` instead of `RelativePath` for its entries, since those are just the layer names --- libs/remote_storage/src/lib.rs | 159 +++------ libs/remote_storage/src/local_fs.rs | 431 ++++++----------------- libs/remote_storage/src/s3_bucket.rs | 286 +++------------ pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/config.rs | 24 +- pageserver/src/storage_sync2.rs | 106 +++--- pageserver/src/storage_sync2/delete.rs | 30 +- pageserver/src/storage_sync2/download.rs | 62 +--- pageserver/src/storage_sync2/index.rs | 113 +++--- pageserver/src/storage_sync2/upload.rs | 37 +- pageserver/src/tenant/timeline.rs | 186 +++++----- safekeeper/src/send_wal.rs | 1 + safekeeper/src/wal_backup.rs | 60 ++-- safekeeper/src/wal_storage.rs | 16 +- 14 files changed, 492 insertions(+), 1023 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 0218fb464d..f72689884e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -10,7 +10,7 @@ mod s3_bucket; use std::{ collections::HashMap, - fmt::{Debug, Display}, + fmt::Debug, num::{NonZeroU32, NonZeroUsize}, ops::Deref, path::{Path, PathBuf}, @@ -41,44 +41,27 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; -#[derive(Clone, PartialEq, Eq)] -pub struct RemoteObjectId(String); +/// Path on the remote storage, relative to some inner prefix. +/// The prefix is an implementation detail, that allows representing local paths +/// as the remote ones, stripping the local storage prefix away. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct RemotePath(PathBuf); + +impl RemotePath { + pub fn new(relative_path: &Path) -> anyhow::Result { + anyhow::ensure!( + relative_path.is_relative(), + "Path {relative_path:?} is not relative" + ); + Ok(Self(relative_path.to_path_buf())) + } + + pub fn with_base(&self, base_path: &Path) -> PathBuf { + base_path.join(&self.0) + } -/// -/// A key that refers to an object in remote storage. It works much like a Path, -/// but it's a separate datatype so that you don't accidentally mix local paths -/// and remote keys. -/// -impl RemoteObjectId { - // Needed to retrieve last component for RemoteObjectId. 
- // In other words a file name - /// Turn a/b/c or a/b/c/ into c pub fn object_name(&self) -> Option<&str> { - // corner case, char::to_string is not const, thats why this is more verbose than it needs to be - // see https://github.com/rust-lang/rust/issues/88674 - if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR { - return None; - } - - if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1) - } else { - self.0 - .rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR) - .map(|(_, last)| last) - } - } -} - -impl Debug for RemoteObjectId { - fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - Debug::fmt(&self.0, fmt) - } -} - -impl Display for RemoteObjectId { - fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - Display::fmt(&self.0, fmt) + self.0.file_name().and_then(|os_str| os_str.to_str()) } } @@ -87,49 +70,40 @@ impl Display for RemoteObjectId { /// providing basic CRUD operations for storage files. #[async_trait::async_trait] pub trait RemoteStorage: Send + Sync + 'static { - /// Attempts to derive the storage path out of the local path, if the latter is correct. - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; - - /// Gets the download path of the given storage file. - fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result; - /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result>; + async fn list(&self) -> anyhow::Result>; /// Lists all top level subdirectories for a given prefix /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) /// so this method doesnt need to. - async fn list_prefixes( - &self, - prefix: Option<&RemoteObjectId>, - ) -> anyhow::Result>; + async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result>; /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, - from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. - from_size_bytes: usize, - to: &RemoteObjectId, + data_size_bytes: usize, + to: &RemotePath, metadata: Option, ) -> anyhow::Result<()>; /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. - async fn download(&self, from: &RemoteObjectId) -> Result; + async fn download(&self, from: &RemotePath) -> Result; /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. async fn download_byte_range( &self, - from: &RemoteObjectId, + from: &RemotePath, start_inclusive: u64, end_exclusive: Option, ) -> Result; - async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>; + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; /// Downcast to LocalFs implementation. For tests. 
fn as_local(&self) -> Option<&LocalFs> { @@ -196,18 +170,17 @@ impl Deref for GenericRemoteStorage { impl GenericRemoteStorage { pub fn from_config( - working_directory: PathBuf, storage_config: &RemoteStorageConfig, ) -> anyhow::Result { Ok(match &storage_config.storage { RemoteStorageKind::LocalFs(root) => { info!("Using fs root '{}' as a remote storage", root.display()); - GenericRemoteStorage::LocalFs(LocalFs::new(root.clone(), working_directory)?) + GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?) } RemoteStorageKind::AwsS3(s3_config) => { info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config, working_directory)?)) + GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) } }) } @@ -221,23 +194,12 @@ impl GenericRemoteStorage { &self, from: Box, from_size_bytes: usize, - from_path: &Path, + to: &RemotePath, ) -> anyhow::Result<()> { - let target_storage_path = self.remote_object_id(from_path).with_context(|| { - format!( - "Failed to get the storage path for source local path '{}'", - from_path.display() - ) - })?; - - self.upload(from, from_size_bytes, &target_storage_path, None) + self.upload(from, from_size_bytes, to, None) .await .with_context(|| { - format!( - "Failed to upload from '{}' to storage path '{:?}'", - from_path.display(), - target_storage_path - ) + format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}") }) } @@ -246,24 +208,11 @@ impl GenericRemoteStorage { pub async fn download_storage_object( &self, byte_range: Option<(u64, Option)>, - to_path: &Path, + from: &RemotePath, ) -> Result { - let remote_object_path = self - .remote_object_id(to_path) - .with_context(|| { - format!( - "Failed to get the storage path for target local path '{}'", - to_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; - match byte_range { - Some((start, end)) => { - self.download_byte_range(&remote_object_path, start, end) - .await - } - None => self.download(&remote_object_path).await, + Some((start, end)) => self.download_byte_range(from, start, end).await, + None => self.download(from).await, } } } @@ -273,23 +222,6 @@ impl GenericRemoteStorage { #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); -fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { - if prefix == path { - anyhow::bail!( - "Prefix and the path are equal, cannot strip: '{}'", - prefix.display() - ) - } else { - path.strip_prefix(prefix).with_context(|| { - format!( - "Path '{}' is not prefixed with '{}'", - path.display(), - prefix.display(), - ) - }) - } -} - /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { @@ -433,21 +365,24 @@ mod tests { use super::*; #[test] - fn object_name() { - let k = RemoteObjectId("a/b/c".to_owned()); + fn test_object_name() { + let k = RemotePath::new(Path::new("a/b/c")).unwrap(); assert_eq!(k.object_name(), Some("c")); - let k = RemoteObjectId("a/b/c/".to_owned()); + let k = RemotePath::new(Path::new("a/b/c/")).unwrap(); assert_eq!(k.object_name(), Some("c")); - let k = RemoteObjectId("a/".to_owned()); + let k = RemotePath::new(Path::new("a/")).unwrap(); assert_eq!(k.object_name(), Some("a")); // XXX is it impossible to have an empty key? 
- let k = RemoteObjectId("".to_owned()); - assert_eq!(k.object_name(), None); - - let k = RemoteObjectId("/".to_owned()); + let k = RemotePath::new(Path::new("")).unwrap(); assert_eq!(k.object_name(), None); } + + #[test] + fn rempte_path_cannot_be_created_from_absolute_ones() { + let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths"); + assert_eq!(err.to_string(), "Path \"/\" is not relative"); + } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 363d47f38d..3e2bded203 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,6 +5,7 @@ //! volume is mounted to the local FS. use std::{ + borrow::Cow, future::Future, path::{Path, PathBuf}, pin::Pin, @@ -18,61 +19,33 @@ use tokio::{ use tracing::*; use utils::crashsafe::path_with_suffix_extension; -use crate::{Download, DownloadError, RemoteObjectId}; +use crate::{Download, DownloadError, RemotePath}; -use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +use super::{RemoteStorage, StorageMetadata}; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; -/// Convert a Path in the remote storage into a RemoteObjectId -fn remote_object_id_from_path(path: &Path) -> anyhow::Result { - Ok(RemoteObjectId( - path.to_str() - .ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))? - .to_string(), - )) -} - #[derive(Debug, Clone)] pub struct LocalFs { - working_directory: PathBuf, storage_root: PathBuf, } impl LocalFs { /// Attempts to create local FS storage, along with its root directory. - pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result { - if !root.exists() { - std::fs::create_dir_all(&root).with_context(|| { - format!( - "Failed to create all directories in the given root path '{}'", - root.display(), - ) + /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative). + pub fn new(mut storage_root: PathBuf) -> anyhow::Result { + if !storage_root.exists() { + std::fs::create_dir_all(&storage_root).with_context(|| { + format!("Failed to create all directories in the given root path {storage_root:?}") })?; } - Ok(Self { - working_directory, - storage_root: root, - }) - } - - /// - /// Get the absolute path in the local filesystem to given remote object. - /// - /// This is public so that it can be used in tests. Should not be used elsewhere. - /// - pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { - let path = PathBuf::from(&remote_object_id.0); - if path.is_relative() { - Ok(self.storage_root.join(path)) - } else if path.starts_with(&self.storage_root) { - Ok(path) - } else { - bail!( - "Path '{}' does not belong to the current storage", - path.display() - ) + if !storage_root.is_absolute() { + storage_root = storage_root.canonicalize().with_context(|| { + format!("Failed to represent path {storage_root:?} as an absolute path") + })?; } + + Ok(Self { storage_root }) } async fn read_storage_metadata( @@ -104,45 +77,48 @@ impl LocalFs { #[async_trait::async_trait] impl RemoteStorage for LocalFs { - /// Convert a "local" path into a "remote path" - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { - let path = self.storage_root.join( - strip_path_prefix(&self.working_directory, local_path) - .context("local path does not belong to this storage")?, - ); - remote_object_id_from_path(&path) + async fn list(&self) -> anyhow::Result> { + Ok(get_all_files(&self.storage_root, true) + .await? 
+ .into_iter() + .map(|path| { + path.strip_prefix(&self.storage_root) + .context("Failed to strip storage root prefix") + .and_then(RemotePath::new) + .expect( + "We list files for storage root, hence should be able to remote the prefix", + ) + }) + .collect()) } - fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { - let storage_path = PathBuf::from(&remote_object_id.0); - let relative_path = strip_path_prefix(&self.storage_root, &storage_path) - .context("local path does not belong to this storage")?; - Ok(self.working_directory.join(relative_path)) - } - - async fn list(&self) -> anyhow::Result> { - get_all_files(&self.storage_root, true).await - } - - async fn list_prefixes( - &self, - prefix: Option<&RemoteObjectId>, - ) -> anyhow::Result> { + async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { let path = match prefix { - Some(prefix) => Path::new(&prefix.0), - None => &self.storage_root, + Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), + None => Cow::Borrowed(&self.storage_root), }; - get_all_files(path, false).await + Ok(get_all_files(path.as_ref(), false) + .await? + .into_iter() + .map(|path| { + path.strip_prefix(&self.storage_root) + .context("Failed to strip preifix") + .and_then(RemotePath::new) + .expect( + "We list files for storage root, hence should be able to remote the prefix", + ) + }) + .collect()) } async fn upload( &self, - from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, - from_size_bytes: usize, - to: &RemoteObjectId, + data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + data_size_bytes: usize, + to: &RemotePath, metadata: Option, ) -> anyhow::Result<()> { - let target_file_path = self.resolve_in_storage(to)?; + let target_file_path = to.with_base(&self.storage_root); create_target_directory(&target_file_path).await?; // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. This was really hit when pageserver shutdown @@ -163,8 +139,8 @@ impl RemoteStorage for LocalFs { })?, ); - let from_size_bytes = from_size_bytes as u64; - let mut buffer_to_read = from.take(from_size_bytes); + let from_size_bytes = data_size_bytes as u64; + let mut buffer_to_read = data.take(from_size_bytes); let bytes_read = io::copy(&mut buffer_to_read, &mut destination) .await @@ -221,27 +197,22 @@ impl RemoteStorage for LocalFs { Ok(()) } - async fn download(&self, from: &RemoteObjectId) -> Result { - let file_path = self - .resolve_in_storage(from) - .map_err(DownloadError::BadInput)?; - if file_exists(&file_path).map_err(DownloadError::BadInput)? { + async fn download(&self, from: &RemotePath) -> Result { + let target_path = from.with_base(&self.storage_root); + if file_exists(&target_path).map_err(DownloadError::BadInput)? 
{ let source = io::BufReader::new( fs::OpenOptions::new() .read(true) - .open(&file_path) + .open(&target_path) .await .with_context(|| { - format!( - "Failed to open source file '{}' to use in the download", - file_path.display() - ) + format!("Failed to open source file {target_path:?} to use in the download") }) .map_err(DownloadError::Other)?, ); let metadata = self - .read_storage_metadata(&file_path) + .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; Ok(Download { @@ -255,7 +226,7 @@ impl RemoteStorage for LocalFs { async fn download_byte_range( &self, - from: &RemoteObjectId, + from: &RemotePath, start_inclusive: u64, end_exclusive: Option, ) -> Result { @@ -267,20 +238,15 @@ impl RemoteStorage for LocalFs { return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } - let file_path = self - .resolve_in_storage(from) - .map_err(DownloadError::BadInput)?; - if file_exists(&file_path).map_err(DownloadError::BadInput)? { + let target_path = from.with_base(&self.storage_root); + if file_exists(&target_path).map_err(DownloadError::BadInput)? { let mut source = io::BufReader::new( fs::OpenOptions::new() .read(true) - .open(&file_path) + .open(&target_path) .await .with_context(|| { - format!( - "Failed to open source file '{}' to use in the download", - file_path.display() - ) + format!("Failed to open source file {target_path:?} to use in the download") }) .map_err(DownloadError::Other)?, ); @@ -290,7 +256,7 @@ impl RemoteStorage for LocalFs { .context("Failed to seek to the range start in a local storage file") .map_err(DownloadError::Other)?; let metadata = self - .read_storage_metadata(&file_path) + .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; @@ -309,15 +275,12 @@ impl RemoteStorage for LocalFs { } } - async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> { - let file_path = self.resolve_in_storage(path)?; + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + let file_path = path.with_base(&self.storage_root); if file_path.exists() && file_path.is_file() { Ok(fs::remove_file(file_path).await?) } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) + bail!("File {file_path:?} either does not exist or is not a file") } } @@ -333,7 +296,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf { fn get_all_files<'a, P>( directory_path: P, recursive: bool, -) -> Pin>> + Send + Sync + 'a>> +) -> Pin>> + Send + Sync + 'a>> where P: AsRef + Send + Sync + 'a, { @@ -347,20 +310,20 @@ where let file_type = dir_entry.file_type().await?; let entry_path = dir_entry.path(); if file_type.is_symlink() { - debug!("{:?} us a symlink, skipping", entry_path) + debug!("{entry_path:?} us a symlink, skipping") } else if file_type.is_dir() { if recursive { paths.extend(get_all_files(&entry_path, true).await?.into_iter()) } else { - paths.push(remote_object_id_from_path(&dir_entry.path())?) 
+ paths.push(entry_path) } } else { - paths.push(remote_object_id_from_path(&dir_entry.path())?); + paths.push(entry_path); } } Ok(paths) } else { - bail!("Path '{}' is not a directory", directory_path.display()) + bail!("Path {directory_path:?} is not a directory") } } else { Ok(Vec::new()) @@ -395,173 +358,6 @@ fn file_exists(file_path: &Path) -> anyhow::Result { } } -#[cfg(test)] -mod pure_tests { - use tempfile::tempdir; - - use super::*; - - #[test] - fn storage_path_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - working_directory: workdir.clone(), - storage_root: storage_root.clone(), - }; - - let local_path = workdir - .join("timelines") - .join("some_timeline") - .join("file_name"); - let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?); - - let actual_path = PathBuf::from( - storage - .remote_object_id(&local_path) - .expect("Matching path should map to storage path normally") - .0, - ); - assert_eq!( - expected_path, - actual_path, - "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir" - ); - - Ok(()) - } - - #[test] - fn storage_path_negatives() -> anyhow::Result<()> { - #[track_caller] - fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String { - match storage.remote_object_id(mismatching_path) { - Ok(wrong_path) => panic!( - "Expected path '{}' to error, but got storage path: {:?}", - mismatching_path.display(), - wrong_path, - ), - Err(e) => format!("{:?}", e), - } - } - - let workdir = tempdir()?.path().to_owned(); - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - working_directory: workdir.clone(), - storage_root, - }; - - let error_string = storage_path_error(&storage, &workdir); - assert!(error_string.contains("does not belong to this storage")); - assert!(error_string.contains(workdir.to_str().unwrap())); - - let mismatching_path_str = "/something/else"; - let error_message = storage_path_error(&storage, Path::new(mismatching_path_str)); - assert!( - error_message.contains(mismatching_path_str), - "Error should mention wrong path" - ); - assert!( - error_message.contains(workdir.to_str().unwrap()), - "Error should mention server workdir" - ); - assert!(error_message.contains("does not belong to this storage")); - - Ok(()) - } - - #[test] - fn local_path_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - working_directory: workdir.clone(), - storage_root: storage_root.clone(), - }; - - let name = "not a metadata"; - let local_path = workdir.join("timelines").join("some_timeline").join(name); - assert_eq!( - local_path, - storage - .local_path(&remote_object_id_from_path( - &storage_root.join(local_path.strip_prefix(&workdir)?) - )?) 
- .expect("For a valid input, valid local path should be parsed"), - "Should be able to parse metadata out of the correctly named remote delta file" - ); - - let local_metadata_path = workdir - .join("timelines") - .join("some_timeline") - .join("metadata"); - let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?; - assert_eq!( - local_metadata_path, - storage - .local_path(&remote_metadata_path) - .expect("For a valid input, valid local path should be parsed"), - "Should be able to parse metadata out of the correctly named remote metadata file" - ); - - Ok(()) - } - - #[test] - fn local_path_negatives() -> anyhow::Result<()> { - #[track_caller] - fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String { - match storage.local_path(storage_path) { - Ok(wrong_path) => panic!( - "Expected local path input {:?} to cause an error, but got file path: {:?}", - storage_path, wrong_path, - ), - Err(e) => format!("{:?}", e), - } - } - - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - working_directory: tempdir()?.path().to_owned(), - storage_root, - }; - - let totally_wrong_path = "wrong_wrong_wrong"; - let error_message = - local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string())); - assert!(error_message.contains(totally_wrong_path)); - - Ok(()) - } - - #[test] - fn download_destination_matches_original_path() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let original_path = workdir - .join("timelines") - .join("some_timeline") - .join("some name"); - - let storage_root = PathBuf::from("somewhere").join("else"); - let dummy_storage = LocalFs { - working_directory: workdir, - storage_root, - }; - - let storage_path = dummy_storage.remote_object_id(&original_path)?; - let download_destination = dummy_storage.local_path(&storage_path)?; - - assert_eq!( - original_path, download_destination, - "'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path" - ); - - Ok(()) - } -} - #[cfg(test)] mod fs_tests { use super::*; @@ -573,7 +369,7 @@ mod fs_tests { storage: &LocalFs, #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements - remote_storage_path: &RemoteObjectId, + remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { let mut download = storage @@ -596,41 +392,16 @@ mod fs_tests { #[tokio::test] async fn upload_file() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); let storage = create_storage()?; - let (file, size) = create_file_for_upload( - &storage.working_directory.join("whatever"), - "whatever_contents", - ) - .await?; - let target_path = "/somewhere/else"; - match storage - .upload( - Box::new(file), - size, - &RemoteObjectId(target_path.to_string()), - None, - ) - .await - { - Ok(()) => panic!("Should not allow storing files with wrong target path"), - Err(e) => { - let message = format!("{:?}", e); - assert!(message.contains(target_path)); - assert!(message.contains("does not belong to the current storage")); - } - } - assert!(storage.list().await?.is_empty()); - - let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?; + let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; assert_eq!( storage.list().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = 
upload_dummy_file(&workdir, &storage, "upload_2", None).await?; + let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -644,7 +415,7 @@ mod fs_tests { async fn upload_file_negatives() -> anyhow::Result<()> { let storage = create_storage()?; - let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?; + let id = RemotePath::new(Path::new("dummy"))?; let content = std::io::Cursor::new(b"12345"); // Check that you get an error if the size parameter doesn't match the actual @@ -669,16 +440,14 @@ mod fs_tests { } fn create_storage() -> anyhow::Result { - LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned()) + LocalFs::new(tempdir()?.path().to_owned()) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; assert_eq!( @@ -688,7 +457,7 @@ mod fs_tests { ); let non_existing_path = "somewhere/else"; - match storage.download(&RemoteObjectId(non_existing_path.to_string())).await { + match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -697,11 +466,9 @@ mod fs_tests { #[tokio::test] async fn download_file_range_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let full_range_download_contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; @@ -767,11 +534,9 @@ mod fs_tests { #[tokio::test] async fn download_file_range_negative() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let start = 1_000_000_000; let end = start + 1; @@ -813,11 +578,9 @@ mod fs_tests { #[tokio::test] async fn delete_file() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None).await?; storage.delete(&upload_target).await?; assert!(storage.list().await?.is_empty()); @@ -827,7 +590,8 @@ mod fs_tests { Err(e) => { let error_string = e.to_string(); assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&upload_target.0)); + let expected_path = upload_target.with_base(&storage.storage_root); + assert!(error_string.contains(expected_path.to_str().unwrap())); } } Ok(()) @@ -835,8 +599,6 @@ mod fs_tests { #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { - let workdir = 
tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ @@ -844,7 +606,7 @@ mod fs_tests { ("two".to_string(), "2".to_string()), ])); let upload_target = - upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?; + upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?; let full_range_download_contents = read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; @@ -884,23 +646,32 @@ mod fs_tests { } async fn upload_dummy_file( - workdir: &Path, storage: &LocalFs, name: &str, metadata: Option, - ) -> anyhow::Result { - let timeline_path = workdir.join("timelines").join("some_timeline"); - let relative_timeline_path = timeline_path.strip_prefix(&workdir)?; - let storage_path = storage.storage_root.join(relative_timeline_path).join(name); - let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string()); - - let from_path = storage.working_directory.join(name); + ) -> anyhow::Result { + let from_path = storage + .storage_root + .join("timelines") + .join("some_timeline") + .join(name); let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?; + let relative_path = from_path + .strip_prefix(&storage.storage_root) + .context("Failed to strip storage root prefix") + .and_then(RemotePath::new) + .with_context(|| { + format!( + "Failed to resolve remote part of path {:?} for base {:?}", + from_path, storage.storage_root + ) + })?; + storage - .upload(Box::new(file), size, &remote_object_id, metadata) + .upload(Box::new(file), size, &relative_path, metadata) .await?; - remote_object_id_from_path(&storage_path) + Ok(relative_path) } async fn create_file_for_upload( @@ -925,7 +696,7 @@ mod fs_tests { format!("contents for {name}") } - async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { + async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { let mut files = storage.list().await?; files.sort_by(|a, b| a.0.cmp(&b.0)); Ok(files) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c721560c29..ab1e5da6c5 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -5,7 +5,6 @@ //! their bucket prefixes are both specified and different. use std::env::var; -use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Duration; @@ -29,8 +28,7 @@ use tracing::debug; use super::StorageMetadata; use crate::{ - strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, }; const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10); @@ -100,31 +98,8 @@ pub(super) mod metrics { } } -fn download_destination( - id: &RemoteObjectId, - workdir: &Path, - prefix_to_strip: Option<&str>, -) -> PathBuf { - let path_without_prefix = match prefix_to_strip { - Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| { - panic!( - "Could not strip prefix '{}' from S3 object key '{}'", - prefix, id.0 - ) - }), - None => &id.0, - }; - - workdir.join( - path_without_prefix - .split(REMOTE_STORAGE_PREFIX_SEPARATOR) - .collect::(), - ) -} - /// AWS S3 storage. 
pub struct S3Bucket { - workdir: PathBuf, client: Client, bucket_name: String, prefix_in_bucket: Option, @@ -142,7 +117,7 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result { + pub fn new(aws_config: &S3Config) -> anyhow::Result { debug!( "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name @@ -196,13 +171,39 @@ impl S3Bucket { }); Ok(Self { client, - workdir, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), }) } + fn s3_object_to_relative_path(&self, key: &str) -> RemotePath { + let relative_path = + match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) { + Some(stripped) => stripped, + // we rely on AWS to return properly prefixed paths + // for requests with a certain prefix + None => panic!( + "Key {} does not start with bucket prefix {:?}", + key, self.prefix_in_bucket + ), + }; + RemotePath( + relative_path + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .collect(), + ) + } + + fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { + let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default(); + for segment in path.0.iter() { + full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + full_path.push_str(segment.to_str().unwrap_or_default()); + } + full_path + } + async fn download_object(&self, request: GetObjectRequest) -> Result { let _guard = self .concurrency_limiter @@ -252,25 +253,7 @@ impl S3Bucket { #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { - let relative_path = strip_path_prefix(&self.workdir, local_path)?; - let mut key = self.prefix_in_bucket.clone().unwrap_or_default(); - for segment in relative_path { - key.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - key.push_str(&segment.to_string_lossy()); - } - Ok(RemoteObjectId(key)) - } - - fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result { - Ok(download_destination( - storage_path, - &self.workdir, - self.prefix_in_bucket.as_deref(), - )) - } - - async fn list(&self) -> anyhow::Result> { + async fn list(&self) -> anyhow::Result> { let mut document_keys = Vec::new(); let mut continuation_token = None; @@ -300,7 +283,7 @@ impl RemoteStorage for S3Bucket { .contents .unwrap_or_default() .into_iter() - .filter_map(|o| Some(RemoteObjectId(o.key?))), + .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))), ); match fetch_response.continuation_token { @@ -314,13 +297,10 @@ impl RemoteStorage for S3Bucket { /// See the doc for `RemoteStorage::list_prefixes` /// Note: it wont include empty "directories" - async fn list_prefixes( - &self, - prefix: Option<&RemoteObjectId>, - ) -> anyhow::Result> { + async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix - .map(|p| p.0.clone()) + .map(|p| self.relative_path_to_s3_object(p)) .or_else(|| self.prefix_in_bucket.clone()) .map(|mut p| { // required to end with a separator @@ -362,7 +342,7 @@ impl RemoteStorage for S3Bucket { .common_prefixes .unwrap_or_default() .into_iter() - .filter_map(|o| Some(RemoteObjectId(o.prefix?))), + .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), ); match fetch_response.continuation_token { @@ -378,7 +358,7 @@ impl RemoteStorage for 
S3Bucket { &self, from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, from_size_bytes: usize, - to: &RemoteObjectId, + to: &RemotePath, metadata: Option, ) -> anyhow::Result<()> { let _guard = self @@ -395,7 +375,7 @@ impl RemoteStorage for S3Bucket { self.client .put_object() .bucket(self.bucket_name.clone()) - .key(to.0.to_owned()) + .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) @@ -408,10 +388,10 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn download(&self, from: &RemoteObjectId) -> Result { + async fn download(&self, from: &RemotePath) -> Result { self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), - key: from.0.to_owned(), + key: self.relative_path_to_s3_object(from), ..GetObjectRequest::default() }) .await @@ -419,7 +399,7 @@ impl RemoteStorage for S3Bucket { async fn download_byte_range( &self, - from: &RemoteObjectId, + from: &RemotePath, start_inclusive: u64, end_exclusive: Option, ) -> Result { @@ -427,19 +407,19 @@ impl RemoteStorage for S3Bucket { // and needs both ends to be exclusive let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); let range = Some(match end_inclusive { - Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive), - None => format!("bytes={}-", start_inclusive), + Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"), + None => format!("bytes={start_inclusive}-"), }); self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), - key: from.0.to_owned(), + key: self.relative_path_to_s3_object(from), range, }) .await } - async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { let _guard = self .concurrency_limiter .acquire() @@ -451,7 +431,7 @@ impl RemoteStorage for S3Bucket { self.client .delete_object() .bucket(self.bucket_name.clone()) - .key(remote_object_id.0.to_owned()) + .key(self.relative_path_to_s3_object(path)) .send() .await .map_err(|e| { @@ -461,181 +441,3 @@ impl RemoteStorage for S3Bucket { Ok(()) } } - -#[cfg(test)] -mod tests { - use tempfile::tempdir; - - use super::*; - - #[test] - fn test_download_destination() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let local_path = workdir.join("one").join("two").join("test_name"); - let relative_path = local_path.strip_prefix(&workdir)?; - - let key = RemoteObjectId(format!( - "{}{}", - REMOTE_STORAGE_PREFIX_SEPARATOR, - relative_path - .iter() - .map(|segment| segment.to_str().unwrap()) - .collect::>() - .join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()), - )); - - assert_eq!( - local_path, - download_destination(&key, &workdir, None), - "Download destination should consist of s3 path joined with the workdir prefix" - ); - - Ok(()) - } - - #[test] - fn storage_path_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - - let segment_1 = "matching"; - let segment_2 = "file"; - let local_path = &workdir.join(segment_1).join(segment_2); - - let storage = dummy_storage(workdir); - - let expected_key = RemoteObjectId(format!( - "{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}", - storage.prefix_in_bucket.as_deref().unwrap_or_default(), - )); - - let actual_key = storage - .remote_object_id(local_path) - .expect("Matching path should map to S3 path normally"); - assert_eq!( - expected_key, - actual_key, - 
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator" - ); - - Ok(()) - } - - #[test] - fn storage_path_negatives() -> anyhow::Result<()> { - #[track_caller] - fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String { - match storage.remote_object_id(mismatching_path) { - Ok(wrong_key) => panic!( - "Expected path '{}' to error, but got S3 key: {:?}", - mismatching_path.display(), - wrong_key, - ), - Err(e) => e.to_string(), - } - } - - let workdir = tempdir()?.path().to_owned(); - let storage = dummy_storage(workdir.clone()); - - let error_message = storage_path_error(&storage, &workdir); - assert!( - error_message.contains("Prefix and the path are equal"), - "Message '{}' does not contain the required string", - error_message - ); - - let mismatching_path = PathBuf::from("somewhere").join("else"); - let error_message = storage_path_error(&storage, &mismatching_path); - assert!( - error_message.contains(mismatching_path.to_str().unwrap()), - "Error should mention wrong path" - ); - assert!( - error_message.contains(workdir.to_str().unwrap()), - "Error should mention server workdir" - ); - assert!( - error_message.contains("is not prefixed with"), - "Message '{}' does not contain a required string", - error_message - ); - - Ok(()) - } - - #[test] - fn local_path_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = dummy_storage(workdir.clone()); - let timeline_dir = workdir.join("timelines").join("test_timeline"); - let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?; - - let s3_key = create_s3_key( - &relative_timeline_path.join("not a metadata"), - storage.prefix_in_bucket.as_deref(), - ); - assert_eq!( - download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), - storage - .local_path(&s3_key) - .expect("For a valid input, valid S3 info should be parsed"), - "Should be able to parse metadata out of the correctly named remote delta file" - ); - - let s3_key = create_s3_key( - &relative_timeline_path.join("metadata"), - storage.prefix_in_bucket.as_deref(), - ); - assert_eq!( - download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), - storage - .local_path(&s3_key) - .expect("For a valid input, valid S3 info should be parsed"), - "Should be able to parse metadata out of the correctly named remote metadata file" - ); - - Ok(()) - } - - #[test] - fn download_destination_matches_original_path() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let original_path = workdir - .join("timelines") - .join("some_timeline") - .join("some name"); - - let dummy_storage = dummy_storage(workdir); - - let key = dummy_storage.remote_object_id(&original_path)?; - let download_destination = dummy_storage.local_path(&key)?; - - assert_eq!( - original_path, download_destination, - "'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path" - ); - - Ok(()) - } - - fn dummy_storage(workdir: PathBuf) -> S3Bucket { - S3Bucket { - workdir, - client: Client::new(&aws_config::SdkConfig::builder().build()), - bucket_name: "dummy-bucket".to_string(), - prefix_in_bucket: Some("dummy_prefix/".to_string()), - concurrency_limiter: Semaphore::new(1), - } - } - - fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId { - RemoteObjectId(relative_file_path.iter().fold( - prefix.unwrap_or_default().to_string(), - |mut 
path_string, segment| { - path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - path_string.push_str(segment.to_str().unwrap()); - path_string - }, - )) - } -} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 3995229e03..d70b36616b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -280,9 +280,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { let remote_storage = conf .remote_storage_config .as_ref() - .map(|storage_config| { - GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config) - }) + .map(GenericRemoteStorage::from_config) .transpose() .context("Failed to init generic remote storage")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4542afae33..86f1fcef94 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,7 +5,7 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{anyhow, bail, ensure, Context, Result}; -use remote_storage::RemoteStorageConfig; +use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -454,6 +454,28 @@ impl PageServerConf { .join(METADATA_FILE_NAME) } + /// Files on the remote storage are stored with paths, relative to the workdir. + /// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. + /// + /// Errors if the path provided does not start from pageserver's workdir. + pub fn remote_path(&self, local_path: &Path) -> anyhow::Result { + local_path + .strip_prefix(&self.workdir) + .context("Failed to strip workdir prefix") + .and_then(RemotePath::new) + .with_context(|| { + format!( + "Failed to resolve remote part of path {:?} for base {:?}", + local_path, self.workdir + ) + }) + } + + /// Turns storage remote path of a file into its local path. + pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf { + remote_path.with_base(&self.workdir) + } + // // Postgres distribution paths // diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 5b3225028f..b5c5a0d25d 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -202,7 +202,7 @@ use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use anyhow::ensure; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; use tokio::runtime::Runtime; use tracing::{info, warn}; use tracing::{info_span, Instrument}; @@ -217,7 +217,7 @@ use crate::metrics::RemoteOpKind; use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS; use crate::{ config::PageServerConf, - storage_sync::index::{LayerFileMetadata, RemotePath}, + storage_sync::index::LayerFileMetadata, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, @@ -337,18 +337,18 @@ impl UploadQueue { let state = UploadQueueInitialized { // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. - latest_files: Default::default(), + latest_files: HashMap::new(), latest_metadata: metadata.clone(), // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent // safekeepers from garbage-collecting anything. 
last_uploaded_consistent_lsn: Lsn(0), // what follows are boring default initializations - task_counter: Default::default(), + task_counter: 0, num_inprogress_layer_uploads: 0, num_inprogress_metadata_uploads: 0, num_inprogress_deletions: 0, - inprogress_tasks: Default::default(), - queued_operations: Default::default(), + inprogress_tasks: HashMap::new(), + queued_operations: VecDeque::new(), }; *self = UploadQueue::Initialized(state); @@ -357,6 +357,10 @@ impl UploadQueue { fn initialize_with_current_remote_index_part( &mut self, + conf: &'static PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, + index_part: &IndexPart, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { @@ -366,14 +370,19 @@ impl UploadQueue { } } - let mut files = HashMap::new(); - for path in &index_part.timeline_layers { + let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + for timeline_name in &index_part.timeline_layers { + let local_path = timeline_path.join(timeline_name); + let remote_timeline_path = conf.remote_path(&local_path).expect( + "Remote timeline path and local timeline path were constructed form the same conf", + ); let layer_metadata = index_part .layer_metadata - .get(path) + .get(timeline_name) .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); - files.insert(path.clone(), layer_metadata); + files.insert(remote_timeline_path, layer_metadata); } let index_part_metadata = index_part.parse_metadata()?; @@ -391,8 +400,8 @@ impl UploadQueue { num_inprogress_layer_uploads: 0, num_inprogress_metadata_uploads: 0, num_inprogress_deletions: 0, - inprogress_tasks: Default::default(), - queued_operations: Default::default(), + inprogress_tasks: HashMap::new(), + queued_operations: VecDeque::new(), }; *self = UploadQueue::Initialized(state); @@ -456,7 +465,12 @@ impl RemoteTimelineClient { /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part( + self.conf, + self.tenant_id, + self.timeline_id, + index_part, + )?; Ok(()) } @@ -510,15 +524,13 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. pub async fn download_layer_file( &self, - path: &RemotePath, + remote_path: &RemotePath, layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { let downloaded_size = download::download_layer_file( self.conf, &self.storage_impl, - self.tenant_id, - self.timeline_id, - path, + remote_path, layer_metadata, ) .measure_remote_op( @@ -536,13 +548,13 @@ impl RemoteTimelineClient { let new_metadata = LayerFileMetadata::new(downloaded_size); let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - if let Some(upgraded) = upload_queue.latest_files.get_mut(path) { + if let Some(upgraded) = upload_queue.latest_files.get_mut(remote_path) { upgraded.merge(&new_metadata); } else { // The file should exist, since we just downloaded it. 
warn!( "downloaded file {:?} not found in local copy of the index file", - path + remote_path ); } } @@ -612,14 +624,9 @@ impl RemoteTimelineClient { "file size not initialized in metadata" ); - let relative_path = RemotePath::strip_base_path( - &self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - path, - )?; - upload_queue .latest_files - .insert(relative_path, layer_metadata.clone()); + .insert(self.conf.remote_path(path)?, layer_metadata.clone()); let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone()); self.update_upload_queue_unfinished_metric(1, &op); @@ -641,13 +648,10 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - // Convert the paths into RelativePaths, and gather other information we need. - let mut relative_paths = Vec::with_capacity(paths.len()); + // Convert the paths into RemotePaths, and gather other information we need. + let mut remote_paths = Vec::with_capacity(paths.len()); for path in paths { - relative_paths.push(RemotePath::strip_base_path( - &self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - path, - )?); + remote_paths.push(self.conf.remote_path(path)?); } // Deleting layers doesn't affect the values stored in TimelineMetadata, @@ -663,8 +667,8 @@ impl RemoteTimelineClient { // from latest_files, but not yet scheduled for deletion. Use a closure // to syntactically forbid ? or bail! calls here. let no_bail_here = || { - for relative_path in relative_paths { - upload_queue.latest_files.remove(&relative_path); + for remote_path in remote_paths { + upload_queue.latest_files.remove(&remote_path); } let index_part = IndexPart::new( @@ -838,14 +842,19 @@ impl RemoteTimelineClient { let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref path, ref layer_metadata) => { - upload::upload_timeline_layer(&self.storage_impl, path, layer_metadata) - .measure_remote_op( - self.tenant_id, - self.timeline_id, - RemoteOpFileKind::Layer, - RemoteOpKind::Upload, - ) - .await + upload::upload_timeline_layer( + self.conf, + &self.storage_impl, + path, + layer_metadata, + ) + .measure_remote_op( + self.tenant_id, + self.timeline_id, + RemoteOpFileKind::Layer, + RemoteOpKind::Upload, + ) + .await } UploadOp::UploadMetadata(ref index_part, _lsn) => { upload::upload_index_part( @@ -864,7 +873,7 @@ impl RemoteTimelineClient { .await } UploadOp::Delete(metric_file_kind, ref path) => { - delete::delete_layer(&self.storage_impl, path) + delete::delete_layer(self.conf, &self.storage_impl, path) .measure_remote_op( self.tenant_id, self.timeline_id, @@ -1093,15 +1102,11 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let xx = PathBuf::from(""); - let mut avec: Vec = a - .iter() - .map(|x| x.to_local_path(&xx).to_string_lossy().into()) - .collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec<&str> = a.iter().map(|a| a.as_str()).collect(); avec.sort(); - let mut bvec = b.to_owned(); + let mut bvec = b.to_vec(); bvec.sort_unstable(); assert_eq!(avec, bvec); @@ -1169,8 +1174,7 @@ mod tests { println!("workdir: {}", harness.conf.workdir.display()); - let storage_impl = - GenericRemoteStorage::from_config(harness.conf.workdir.clone(), &storage_config)?; + let storage_impl = GenericRemoteStorage::from_config(&storage_config)?; let client = Arc::new(RemoteTimelineClient { conf: harness.conf, runtime, diff --git 
a/pageserver/src/storage_sync2/delete.rs b/pageserver/src/storage_sync2/delete.rs index f22dbdc2d8..9f6732fbff 100644 --- a/pageserver/src/storage_sync2/delete.rs +++ b/pageserver/src/storage_sync2/delete.rs @@ -5,34 +5,24 @@ use tracing::debug; use remote_storage::GenericRemoteStorage; -pub(super) async fn delete_layer( - storage: &GenericRemoteStorage, - local_layer_path: &Path, +use crate::config::PageServerConf; + +pub(super) async fn delete_layer<'a>( + conf: &'static PageServerConf, + storage: &'a GenericRemoteStorage, + local_layer_path: &'a Path, ) -> anyhow::Result<()> { fail::fail_point!("before-delete-layer", |_| { anyhow::bail!("failpoint before-delete-layer") }); - debug!( - "Deleting layer from remote storage: {:?}", - local_layer_path.display() - ); + debug!("Deleting layer from remote storage: {local_layer_path:?}",); - let storage_path = storage - .remote_object_id(local_layer_path) - .with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - local_layer_path.display() - ) - })?; + let path_to_delete = conf.remote_path(local_layer_path)?; // XXX: If the deletion fails because the object already didn't exist, // it would be good to just issue a warning but consider it success. // https://github.com/neondatabase/neon/issues/2934 - storage.delete(&storage_path).await.with_context(|| { - format!( - "Failed to delete remote layer from storage at '{:?}'", - storage_path - ) + storage.delete(&path_to_delete).await.with_context(|| { + format!("Failed to delete remote layer from storage at {path_to_delete:?}") }) } diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs index d68455ea2b..18a6ac0179 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/storage_sync2/download.rs @@ -10,12 +10,11 @@ use tracing::debug; use crate::config::PageServerConf; use crate::storage_sync::index::LayerFileMetadata; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use super::index::IndexPart; -use super::RemotePath; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { fs::File::open(path).await?.sync_all().await @@ -29,21 +28,10 @@ async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Er pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - tenant_id: TenantId, - timeline_id: TimelineId, - path: &'a RemotePath, + remote_path: &'a RemotePath, layer_metadata: &'a LayerFileMetadata, ) -> anyhow::Result { - let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); - - let local_path = path.to_local_path(&timeline_path); - - let layer_storage_path = storage.remote_object_id(&local_path).with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - local_path.display() - ) - })?; + let local_path = conf.local_path(remote_path); // Perform a rename inspired by durable_rename from file_utils.c. 
// The sequence: @@ -64,19 +52,14 @@ pub async fn download_layer_file<'a>( temp_file_path.display() ) })?; - let mut download = storage - .download(&layer_storage_path) - .await - .with_context(|| { - format!( - "Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'" - ) - })?; - let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { + let mut download = storage.download(remote_path).await.with_context(|| { format!( - "Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display() + "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" ) })?; + let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { + format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") + })?; // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: // A file will not be closed immediately when it goes out of scope if there are any IO operations @@ -151,12 +134,7 @@ pub async fn list_remote_timelines<'a>( tenant_id: TenantId, ) -> anyhow::Result> { let tenant_path = conf.timelines_path(&tenant_id); - let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| { - format!( - "Failed to get tenant storage path for local path '{}'", - tenant_path.display() - ) - })?; + let tenant_storage_path = conf.remote_path(&tenant_path)?; let timelines = storage .list_prefixes(Some(&tenant_storage_path)) @@ -218,14 +196,8 @@ pub async fn download_index_part( let index_part_path = conf .metadata_path(timeline_id, tenant_id) .with_file_name(IndexPart::FILE_NAME); - let part_storage_path = storage - .remote_object_id(&index_part_path) - .with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - }) + let part_storage_path = conf + .remote_path(&index_part_path) .map_err(DownloadError::BadInput)?; let mut index_part_download = storage.download(&part_storage_path).await?; @@ -236,20 +208,12 @@ pub async fn download_index_part( &mut index_part_bytes, ) .await - .with_context(|| { - format!( - "Failed to download an index part into file '{}'", - index_part_path.display() - ) - }) + .with_context(|| format!("Failed to download an index part into file {index_part_path:?}")) .map_err(DownloadError::Other)?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| { - format!( - "Failed to deserialize index part file into file '{}'", - index_part_path.display() - ) + format!("Failed to deserialize index part file into file {index_part_path:?}") }) .map_err(DownloadError::Other)?; diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index a1da37b826..5560712a1b 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -2,12 +2,9 @@ //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! remote timeline layers and its metadata. 
-use std::{ - collections::{HashMap, HashSet}, - path::{Path, PathBuf}, -}; +use std::collections::{HashMap, HashSet}; -use anyhow::{Context, Ok}; +use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; @@ -15,34 +12,6 @@ use crate::tenant::metadata::TimelineMetadata; use utils::lsn::Lsn; -/// Path on the remote storage, relative to some inner prefix. -/// The prefix is an implementation detail, that allows representing local paths -/// as the remote ones, stripping the local storage prefix away. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] -#[serde(transparent)] -pub struct RemotePath(PathBuf); - -impl RemotePath { - pub fn new(relative_path: &Path) -> Self { - debug_assert!( - relative_path.is_relative(), - "Path {relative_path:?} is not relative" - ); - Self(relative_path.to_path_buf()) - } - - pub fn strip_base_path(base_path: &Path, full_path: &Path) -> anyhow::Result { - let relative = full_path.strip_prefix(base_path).with_context(|| { - format!("path {full_path:?} is not relative to base {base_path:?}",) - })?; - Ok(Self::new(relative)) - } - - pub fn to_local_path(&self, base_path: &Path) -> PathBuf { - base_path.join(&self.0) - } -} - /// Metadata gathered for each of the layer files. /// /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which @@ -101,19 +70,19 @@ pub struct IndexPart { /// Layer names, which are stored on the remote storage. /// /// Additional metadata can might exist in `layer_metadata`. - pub timeline_layers: HashSet, + pub timeline_layers: HashSet, /// FIXME: unused field. This should be removed, but that changes the on-disk format, - /// so we need to make sure we're backwards- (and maybe forwards-) compatible + /// so we need to make sure we're backwards-` (and maybe forwards-) compatible /// First pass is to move it to Optional and the next would be its removal - missing_layers: Option>, + missing_layers: Option>, /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. #[serde(default)] - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated here for convenience. 
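For reference, the `RemotePath` struct removed above is replaced by a type of the same name that now lives in the `remote_storage` crate. Its definition is not part of this hunk, so the following is only a rough sketch reconstructed from the call sites in this series (`new`, `with_base`, `object_name`); field layout and error handling are assumptions.

use std::path::{Path, PathBuf};

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);

impl RemotePath {
    /// Only relative paths are accepted; callers strip their local base
    /// (pageserver workdir, safekeeper workspace dir, LocalFs storage_root) first.
    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
            relative_path.is_relative(),
            "Path {relative_path:?} is not relative"
        );
        Ok(Self(relative_path.to_path_buf()))
    }

    /// Re-attaches a local base, e.g. `remote_path.with_base(&conf.workdir)`.
    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }

    /// Last path segment; used below by `IndexPart::new` as the plain layer file name.
    pub fn object_name(&self) -> Option<&str> {
        self.0.file_name().and_then(|name| name.to_str())
    }
}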
@@ -135,14 +104,20 @@ impl IndexPart { disk_consistent_lsn: Lsn, metadata_bytes: Vec, ) -> Self { - let mut timeline_layers = HashSet::new(); - let mut layer_metadata = HashMap::new(); + let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len()); + let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len()); - separate_paths_and_metadata( - &layers_and_metadata, - &mut timeline_layers, - &mut layer_metadata, - ); + for (remote_path, metadata) in &layers_and_metadata { + let metadata = IndexLayerMetadata::from(metadata); + match remote_path.object_name() { + Some(layer_name) => { + timeline_layers.insert(layer_name.to_owned()); + layer_metadata.insert(layer_name.to_owned(), metadata); + } + // TODO move this on a type level: we know, that every layer entry does have a name + None => panic!("Layer {remote_path:?} has no file name, skipping"), + } + } Self { version: Self::LATEST_VERSION, @@ -173,18 +148,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { } } -fn separate_paths_and_metadata( - input: &HashMap, - output: &mut HashSet, - layer_metadata: &mut HashMap, -) { - for (path, metadata) in input { - let metadata = IndexLayerMetadata::from(metadata); - layer_metadata.insert(path.clone(), metadata); - output.insert(path.clone()); - } -} - #[cfg(test)] mod tests { use super::*; @@ -200,8 +163,8 @@ mod tests { let expected = IndexPart { version: 0, - timeline_layers: HashSet::from([RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))]), - missing_layers: Some(HashSet::from([RemotePath(PathBuf::from("not_a_real_layer_but_adding_coverage"))])), + timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]), + missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])), layer_metadata: HashMap::default(), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), @@ -228,13 +191,13 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 1, - timeline_layers: HashSet::from([RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))]), - missing_layers: Some(HashSet::from([RemotePath(PathBuf::from("not_a_real_layer_but_adding_coverage"))])), + timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]), + missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])), layer_metadata: HashMap::from([ - (RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")), IndexLayerMetadata { + (String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"), IndexLayerMetadata { file_size: Some(25600000), }), - (RemotePath(PathBuf::from("not_a_real_layer_but_adding_coverage")), IndexLayerMetadata { + (String::from("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: Some(9007199254741001), @@ -264,20 +227,26 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, - timeline_layers: [RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"))].into_iter().collect(), + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string()]), layer_metadata: HashMap::from([ - (RemotePath(PathBuf::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")), IndexLayerMetadata { - file_size: Some(25600000), - }), - (RemotePath(PathBuf::from("not_a_real_layer_but_adding_coverage")), IndexLayerMetadata { - // serde_json should always parse this but this might be a double with jq for - // example. - file_size: Some(9007199254741001), - }) + ( + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string(), + IndexLayerMetadata { + file_size: Some(25600000), + } + ), + ( + "not_a_real_layer_but_adding_coverage".to_string(), + IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: Some(9007199254741001), + } + ) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), - missing_layers: None::>, + missing_layers: None, }; let part = serde_json::from_str::(example).unwrap(); diff --git a/pageserver/src/storage_sync2/upload.rs b/pageserver/src/storage_sync2/upload.rs index b03a0f6ce7..57a524a22d 100644 --- a/pageserver/src/storage_sync2/upload.rs +++ b/pageserver/src/storage_sync2/upload.rs @@ -30,12 +30,9 @@ pub(super) async fn upload_index_part<'a>( let index_part_path = conf .metadata_path(timeline_id, tenant_id) .with_file_name(IndexPart::FILE_NAME); + let storage_path = conf.remote_path(&index_part_path)?; storage - .upload_storage_object( - Box::new(index_part_bytes), - index_part_size, - &index_part_path, - ) + .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path) .await .with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'")) } @@ -44,36 +41,26 @@ pub(super) async fn upload_index_part<'a>( /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. /// /// On an error, bumps the retries count and reschedules the entire task. -pub(super) async fn upload_timeline_layer( - storage: &GenericRemoteStorage, - source_path: &Path, - known_metadata: &LayerFileMetadata, +pub(super) async fn upload_timeline_layer<'a>( + conf: &'static PageServerConf, + storage: &'a GenericRemoteStorage, + source_path: &'a Path, + known_metadata: &'a LayerFileMetadata, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { bail!("failpoint before-upload-layer") }); - let storage_path = storage.remote_object_id(source_path).with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - source_path.display() - ) - })?; + let storage_path = conf.remote_path(source_path)?; - let source_file = fs::File::open(&source_path).await.with_context(|| { - format!( - "Failed to open a source file for layer '{}'", - source_path.display() - ) - })?; + let source_file = fs::File::open(&source_path) + .await + .with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?; let fs_size = source_file .metadata() .await .with_context(|| { - format!( - "Failed to get the source file metadata for layer '{}'", - source_path.display() - ) + format!("Failed to get the source file metadata for layer {source_path:?}") })? 
.len(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4011156ec5..3b15966352 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,7 +19,7 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::{Duration, Instant, SystemTime}; -use crate::storage_sync::index::{IndexPart, RemotePath}; +use crate::storage_sync::index::IndexPart; use crate::storage_sync::RemoteTimelineClient; use crate::tenant::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, @@ -999,55 +999,9 @@ impl Timeline { &self, index_part: &IndexPart, remote_client: &RemoteTimelineClient, - mut local_filenames: HashSet, + local_layers: HashSet, up_to_date_disk_consistent_lsn: Lsn, ) -> anyhow::Result> { - let mut remote_filenames: HashSet = HashSet::new(); - for fname in index_part.timeline_layers.iter() { - remote_filenames.insert(fname.to_local_path(&PathBuf::from(""))); - } - - // Are there any local files that exist, with a size that doesn't match - // with the size stored in the remote index file? - // If so, rename_to_backup those files so that we re-download them later. - local_filenames.retain(|path| { - let layer_metadata = index_part - .layer_metadata - .get(&RemotePath::new(path)) - .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); - - if let Some(remote_size) = layer_metadata.file_size() { - let local_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id).join(&path); - match local_path.metadata() { - Ok(metadata) => { - let local_size = metadata.len(); - - if local_size != remote_size { - warn!("removing local file \"{}\" because it has unexpected length {}; length in remote index is {}", - path.display(), - local_size, - remote_size); - if let Err(err) = rename_to_backup(&local_path) { - error!("could not rename file \"{}\": {:?}", - local_path.display(), err); - } - self.metrics.current_physical_size_gauge.sub(local_size); - false - } else { - true - } - } - Err(err) => { - error!("could not get size of local file \"{}\": {:?}", path.display(), err); - true - } - } - } else { - true - } - }); - // Are we missing some files that are present in remote storage? // Download them now. // TODO Downloading many files this way is not efficient. 
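The rewritten `download_missing` hunk below builds every remote layer path with the new `PageServerConf::remote_path`/`local_path` helpers added to pageserver/src/config.rs earlier in this patch. A minimal usage sketch, with the layer file name and ids invented for illustration:

use crate::config::PageServerConf;
use utils::id::{TenantId, TimelineId};

// Illustrative only: the layer name is a stand-in; the helpers themselves are
// the ones introduced in config.rs by this patch.
fn layer_paths_round_trip(
    conf: &PageServerConf,
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> anyhow::Result<()> {
    let local_layer_path = conf
        .timeline_path(&timeline_id, &tenant_id)
        .join("some_layer_file_name");

    // remote_path() strips conf.workdir, so the remote object keeps the
    // workdir-relative tenant/timeline layout.
    let remote_layer_path = conf.remote_path(&local_layer_path)?;

    // local_path() simply re-joins the workdir, round-tripping to the input.
    assert_eq!(conf.local_path(&remote_layer_path), local_layer_path);
    Ok(())
}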
@@ -1056,17 +1010,63 @@ impl Timeline { // b) typical case now is that there is nothing to sync, this downloads a lot // 1) if there was another pageserver that came and generated new files // 2) during attach of a timeline with big history which we currently do not do - for path in remote_filenames.difference(&local_filenames) { - let fname = path.to_str().unwrap(); - info!("remote layer file {fname} does not exist locally"); + let mut local_only_layers = local_layers; + let timeline_dir = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + for remote_layer_name in &index_part.timeline_layers { + let local_layer_path = timeline_dir.join(remote_layer_name); + local_only_layers.remove(&local_layer_path); - let layer_metadata = index_part + let remote_layer_metadata = index_part .layer_metadata - .get(&RemotePath::new(path)) + .get(remote_layer_name) .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); - if let Some(imgfilename) = ImageFileName::parse_str(fname) { + let remote_layer_path = self + .conf + .remote_path(&local_layer_path) + .expect("local_layer_path received from the same conf that provided a workdir"); + + if local_layer_path.exists() { + let mut already_downloaded = true; + // Are there any local files that exist, with a size that doesn't match + // with the size stored in the remote index file? + // If so, rename_to_backup those files so that we re-download them later. + if let Some(remote_size) = remote_layer_metadata.file_size() { + match local_layer_path.metadata() { + Ok(metadata) => { + let local_size = metadata.len(); + + if local_size != remote_size { + warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); + if let Err(err) = rename_to_backup(&local_layer_path) { + error!("could not rename file {local_layer_path:?}: {err:?}"); + } else { + self.metrics.current_physical_size_gauge.sub(local_size); + already_downloaded = false; + } + } + } + Err(err) => { + error!("could not get size of local file {local_layer_path:?}: {err:?}") + } + } + } + + if already_downloaded { + continue; + } + } else { + info!("remote layer {remote_layer_path:?} does not exist locally"); + } + + let layer_name = local_layer_path + .file_name() + .and_then(|os_str| os_str.to_str()) + .with_context(|| { + format!("Layer file {local_layer_path:?} has no name in unicode") + })?; + if let Some(imgfilename) = ImageFileName::parse_str(layer_name) { if imgfilename.lsn > up_to_date_disk_consistent_lsn { warn!( "found future image layer {} on timeline {} remote_consistent_lsn is {}", @@ -1075,11 +1075,13 @@ impl Timeline { continue; } - trace!("downloading image file: {path:?}"); - let sz = remote_client - .download_layer_file(&RemotePath::new(path), &layer_metadata) + trace!("downloading image file: {remote_layer_path:?}"); + let downloaded_size = remote_client + .download_layer_file(&remote_layer_path, &remote_layer_metadata) .await - .context("download image layer")?; + .with_context(|| { + format!("failed to download image layer from path {remote_layer_path:?}") + })?; trace!("done"); let image_layer = @@ -1089,8 +1091,10 @@ impl Timeline { .write() .unwrap() .insert_historic(Arc::new(image_layer)); - self.metrics.current_physical_size_gauge.add(sz); - } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { + self.metrics + .current_physical_size_gauge + .add(downloaded_size); + } else if let Some(deltafilename) = DeltaFileName::parse_str(layer_name) { // Create a DeltaLayer 
struct for each delta file. // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is @@ -1105,11 +1109,13 @@ impl Timeline { continue; } - trace!("downloading delta file: {path:?}"); + trace!("downloading delta file: {remote_layer_path:?}"); let sz = remote_client - .download_layer_file(&RemotePath::new(path), &layer_metadata) + .download_layer_file(&remote_layer_path, &remote_layer_metadata) .await - .context("download delta layer")?; + .with_context(|| { + format!("failed to download delta layer from path {remote_layer_path:?}") + })?; trace!("done"); let delta_layer = @@ -1121,16 +1127,11 @@ impl Timeline { .insert_historic(Arc::new(delta_layer)); self.metrics.current_physical_size_gauge.add(sz); } else { - bail!("unexpected layer filename in remote storage: {}", fname); + bail!("unexpected layer filename {layer_name} in remote storage path: {remote_layer_path:?}"); } } - // now these are local only filenames - let local_only_filenames = local_filenames - .difference(&remote_filenames) - .cloned() - .collect(); - Ok(local_only_filenames) + Ok(local_only_layers) } /// @@ -1164,47 +1165,46 @@ impl Timeline { let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn(); // Build a map of local layers for quick lookups - let mut local_filenames: HashSet = HashSet::new(); - for layer in self.layers.read().unwrap().iter_historic_layers() { - local_filenames.insert(layer.filename()); - } + let local_layers = self + .layers + .read() + .unwrap() + .iter_historic_layers() + .map(|historic_layer| { + historic_layer + .local_path() + .expect("Historic layers should have a path") + }) + .collect::>(); - let local_only_filenames = match index_part { + let local_only_layers = match index_part { Some(index_part) => { info!( "initializing upload queue from remote index with {} layer files", index_part.timeline_layers.len() ); remote_client.init_upload_queue(index_part)?; - let local_only_filenames = self - .download_missing( - index_part, - remote_client, - local_filenames, - disk_consistent_lsn, - ) - .await?; - local_only_filenames + self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn) + .await? } None => { info!("initializing upload queue as empty"); remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?; - local_filenames + local_layers } }; // Are there local files that don't exist remotely? Schedule uploads for them - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - for fname in &local_only_filenames { - let absolute = timeline_path.join(fname); - let sz = absolute + for layer_path in &local_only_layers { + let layer_size = layer_path .metadata() - .with_context(|| format!("failed to get file {} metadata", fname.display()))? + .with_context(|| format!("failed to get file {layer_path:?} metadata"))? 
.len(); - info!("scheduling {} for upload", fname.display()); - remote_client.schedule_layer_file_upload(&absolute, &LayerFileMetadata::new(sz))?; + info!("scheduling {layer_path:?} for upload"); + remote_client + .schedule_layer_file_upload(layer_path, &LayerFileMetadata::new(layer_size))?; } - if !local_only_filenames.is_empty() { + if !local_only_layers.is_empty() { remote_client.schedule_index_upload(up_to_date_metadata)?; } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 576a02c686..a3481430d0 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -226,6 +226,7 @@ impl ReplicationConn { let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( + spg.conf.workdir.clone(), spg.conf.timeline_dir(&tli.ttid), &persisted_state, start_pos, diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 0a43d6085c..300e9a1cba 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -13,7 +13,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::fs::File; use tokio::runtime::Builder; @@ -151,7 +151,7 @@ async fn update_task( let timeline_dir = conf.timeline_dir(&ttid); let handle = tokio::spawn( - backup_task_main(ttid, timeline_dir, shutdown_rx) + backup_task_main(ttid, timeline_dir, conf.workdir.clone(), shutdown_rx) .instrument(info_span!("WAL backup task", ttid = %ttid)), ); @@ -182,10 +182,10 @@ async fn wal_backup_launcher_main_loop( let conf_ = conf.clone(); REMOTE_STORAGE.get_or_init(|| { - conf_.remote_storage.as_ref().map(|c| { - GenericRemoteStorage::from_config(conf_.workdir, c) - .expect("failed to create remote storage") - }) + conf_ + .remote_storage + .as_ref() + .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) }); // Presense in this map means launcher is aware s3 offloading is needed for @@ -234,6 +234,7 @@ async fn wal_backup_launcher_main_loop( struct WalBackupTask { timeline: Arc, timeline_dir: PathBuf, + workspace_dir: PathBuf, wal_seg_size: usize, commit_lsn_watch_rx: watch::Receiver, } @@ -242,6 +243,7 @@ struct WalBackupTask { async fn backup_task_main( ttid: TenantTimelineId, timeline_dir: PathBuf, + workspace_dir: PathBuf, mut shutdown_rx: Receiver<()>, ) { info!("started"); @@ -257,6 +259,7 @@ async fn backup_task_main( commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), timeline: tli, timeline_dir, + workspace_dir, }; // task is spinned up only when wal_seg_size already initialized @@ -321,6 +324,7 @@ impl WalBackupTask { commit_lsn, self.wal_seg_size, &self.timeline_dir, + &self.workspace_dir, ) .await { @@ -353,11 +357,12 @@ pub async fn backup_lsn_range( end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Path, + workspace_dir: &Path, ) -> Result { let mut res = start_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); for s in &segments { - backup_single_segment(s, timeline_dir) + backup_single_segment(s, timeline_dir, workspace_dir) .await .with_context(|| format!("offloading segno {}", s.seg_no))?; @@ -372,11 +377,24 @@ pub async fn backup_lsn_range( Ok(res) } -async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> { - let segment_file_name = seg.file_path(timeline_dir)?; +async fn backup_single_segment( + seg: &Segment, + timeline_dir: 
&Path, + workspace_dir: &Path, +) -> Result<()> { + let segment_file_path = seg.file_path(timeline_dir)?; + let remote_segment_path = segment_file_path + .strip_prefix(&workspace_dir) + .context("Failed to strip workspace dir prefix") + .and_then(RemotePath::new) + .with_context(|| { + format!( + "Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}", + ) + })?; - backup_object(&segment_file_name, seg.size()).await?; - debug!("Backup of {} done", segment_file_name.display()); + backup_object(&segment_file_path, &remote_segment_path, seg.size()).await?; + debug!("Backup of {} done", segment_file_path.display()); Ok(()) } @@ -426,7 +444,7 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { static REMOTE_STORAGE: OnceCell> = OnceCell::new(); -async fn backup_object(source_file: &Path, size: usize) -> Result<()> { +async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize) -> Result<()> { let storage = REMOTE_STORAGE .get() .expect("failed to get remote storage") @@ -441,12 +459,12 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> { })?); storage - .upload_storage_object(Box::new(file), size, source_file) + .upload_storage_object(Box::new(file), size, target_file) .await } pub async fn read_object( - file_path: PathBuf, + file_path: &RemotePath, offset: u64, ) -> anyhow::Result>> { let storage = REMOTE_STORAGE @@ -455,19 +473,13 @@ pub async fn read_object( .as_ref() .context("No remote storage configured")?; - info!( - "segment download about to start for local path {} at offset {}", - file_path.display(), - offset - ); + info!("segment download about to start from remote path {file_path:?} at offset {offset}"); + let download = storage - .download_storage_object(Some((offset, None)), &file_path) + .download_storage_object(Some((offset, None)), file_path) .await .with_context(|| { - format!( - "Failed to open WAL segment download stream for local path {}", - file_path.display() - ) + format!("Failed to open WAL segment download stream for remote path {file_path:?}") })?; Ok(download.download_stream) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index bc5e2d7b24..52368bb719 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,6 +8,7 @@ //! Note that last file has `.partial` suffix, that's different from postgres. 
use anyhow::{bail, Context, Result}; +use remote_storage::RemotePath; use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; @@ -445,6 +446,7 @@ fn remove_segments_from_disk( } pub struct WalReader { + workdir: PathBuf, timeline_dir: PathBuf, wal_seg_size: usize, pos: Lsn, @@ -459,6 +461,7 @@ pub struct WalReader { impl WalReader { pub fn new( + workdir: PathBuf, timeline_dir: PathBuf, state: &SafeKeeperState, start_pos: Lsn, @@ -478,6 +481,7 @@ impl WalReader { } Ok(Self { + workdir, timeline_dir, wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, @@ -545,7 +549,17 @@ impl WalReader { // Try to open remote file, if remote reads are enabled if self.enable_remote_read { - return read_object(wal_file_path, xlogoff as u64).await; + let remote_wal_file_path = wal_file_path + .strip_prefix(&self.workdir) + .context("Failed to strip workdir prefix") + .and_then(RemotePath::new) + .with_context(|| { + format!( + "Failed to resolve remote part of path {:?} for base {:?}", + wal_file_path, self.workdir, + ) + })?; + return read_object(&remote_wal_file_path, xlogoff as u64).await; } bail!("WAL segment is not found") From e1ef62f08657a32accc42f38583cd6a2ffe25bd4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 8 Dec 2022 09:12:38 +0200 Subject: [PATCH 029/167] Print more information about context of failed walredo requests (#3003) --- pageserver/benches/bench_walredo.rs | 2 +- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/timeline.rs | 6 ++---- pageserver/src/walredo.rs | 16 ++++++++++++---- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 85caa565fe..8f53fce027 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -431,7 +431,7 @@ fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { struct Request { key: Key, lsn: Lsn, - base_img: Option, + base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 88476cf7b6..a7601ba2a7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2669,7 +2669,7 @@ pub mod harness { &self, key: Key, lsn: Lsn, - base_img: Option, + base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, ) -> Result { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3b15966352..ec8049bcea 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2642,24 +2642,22 @@ impl Timeline { data.records.len() ); } else { - let base_img = if let Some((_lsn, img)) = data.img { + if data.img.is_some() { trace!( "found {} WAL records and a base image for {} at {}, performing WAL redo", data.records.len(), key, request_lsn ); - Some(img) } else { trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); - None }; let last_rec_lsn = data.records.last().unwrap().0; let img = self .walredo_mgr - .request_redo(key, request_lsn, base_img, data.records, self.pg_version) + .request_redo(key, request_lsn, data.img, data.records, self.pg_version) .context("Failed to reconstruct a page image:")?; if img.len() == page_cache::PAGE_SZ { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 378f8deed7..ca7cfb7413 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -84,7 +84,7 @@ pub trait WalRedoManager: 
Send + Sync { &self, key: Key, lsn: Lsn, - base_img: Option, + base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, ) -> Result; @@ -147,7 +147,7 @@ impl WalRedoManager for PostgresRedoManager { &self, key: Key, lsn: Lsn, - base_img: Option, + base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, ) -> Result { @@ -156,7 +156,8 @@ impl WalRedoManager for PostgresRedoManager { return Err(WalRedoError::InvalidRequest); } - let mut img: Option = base_img; + let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); + let mut img = base_img.map(|p| p.1); let mut batch_neon = can_apply_in_neon(&records[0].1); let mut batch_start = 0; for i in 1..records.len() { @@ -170,6 +171,7 @@ impl WalRedoManager for PostgresRedoManager { key, lsn, img, + base_img_lsn, &records[batch_start..i], self.conf.wal_redo_timeout, pg_version, @@ -189,6 +191,7 @@ impl WalRedoManager for PostgresRedoManager { key, lsn, img, + base_img_lsn, &records[batch_start..], self.conf.wal_redo_timeout, pg_version, @@ -223,11 +226,13 @@ impl PostgresRedoManager { /// /// Process one request for WAL redo using wal-redo postgres /// + #[allow(clippy::too_many_arguments)] fn apply_batch_postgres( &self, key: Key, lsn: Lsn, base_img: Option, + base_img_lsn: Lsn, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, @@ -282,9 +287,12 @@ impl PostgresRedoManager { // next request will launch a new one. if result.is_err() { error!( - "error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}", + "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}", records.len(), + records.first().map(|p| p.0).unwrap_or(Lsn(0)), + records.last().map(|p| p.0).unwrap_or(Lsn(0)), nbytes, + base_img_lsn, lsn ); let process = process_guard.take().unwrap(); From 0d04cd0b99a5478f2d0849dd1f463d1cc5d2eea9 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 8 Dec 2022 09:49:43 +0100 Subject: [PATCH 030/167] Run compaction on the buffer holding received buffers when useful (#3028) This cleans up unused entries and reduces the chance of prefetch buffer thrashing. 
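Editor's note: to make the compaction patch below easier to review, here is a standalone C sketch of the sliding step at the heart of compact_prefetch_buffers(). It is an illustration only, not the code from pgxn/neon/pagestore_smgr.c: the Ring/Slot types, GET_SLOT macro and the main() scenario are invented for this sketch, and it omits the parts the real function also has to do, such as re-keying moved slots in the prefetch hash table (prfh_delete/prfh_insert) and handling the PRFS_REQUESTED/PRFS_TAG_REMAINS states.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8

typedef enum
{
	SLOT_UNUSED,
	SLOT_RECEIVED
} SlotStatus;

typedef struct
{
	SlotStatus	status;
	int			payload;		/* stands in for the buffered response */
} Slot;

typedef struct
{
	Slot		slots[RING_SIZE];
	uint64_t	last;			/* oldest slot still tracked */
	uint64_t	receive;		/* one past the newest received slot */
} Ring;

#define GET_SLOT(r, i) (&(r)->slots[(i) % RING_SIZE])

/*
 * Slide received entries towards the newest end of the [last, receive)
 * window so that all unused entries end up at the oldest end, then
 * advance `last` past the freed run in one step.
 */
static void
compact(Ring *r)
{
	uint64_t	empty = r->receive;	/* sentinel: no gap found yet */
	uint64_t	scan = r->receive;

	/* Find the newest unused slot in the window. */
	while (scan > r->last)
	{
		scan--;
		if (GET_SLOT(r, scan)->status == SLOT_UNUSED)
		{
			empty = scan;
			break;
		}
	}
	if (empty == r->receive)
		return;					/* no gaps, nothing to compact */

	/* Move every received slot below the gap up into the gap. */
	while (scan > r->last)
	{
		scan--;
		if (GET_SLOT(r, scan)->status == SLOT_UNUSED)
			continue;
		*GET_SLOT(r, empty) = *GET_SLOT(r, scan);
		GET_SLOT(r, scan)->status = SLOT_UNUSED;
		empty--;
	}

	/* Slots last..empty are now all unused; release them. */
	r->last = empty + 1;
}

int
main(void)
{
	Ring		r = {0};

	/* Three responses buffered, the middle one already consumed. */
	for (int i = 0; i < 3; i++)
	{
		GET_SLOT(&r, r.receive)->status = SLOT_RECEIVED;
		GET_SLOT(&r, r.receive)->payload = i;
		r.receive++;
	}
	GET_SLOT(&r, 1)->status = SLOT_UNUSED;

	compact(&r);

	printf("window: last=%llu receive=%llu\n",
		   (unsigned long long) r.last, (unsigned long long) r.receive);
	/* Only the two live responses remain in the window. */
	assert(r.receive - r.last == 2);
	return 0;
}

Because entries move towards the newest end of the window, `receive` stays untouched and `last` can skip the whole freed run at once, which is how the patch reclaims slots left behind by out-of-order consumption instead of letting them pin the ring.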
--- pgxn/neon/pagestore_smgr.c | 140 ++++++++++++++++++++++++++++++++----- 1 file changed, 122 insertions(+), 18 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 434a1c2b85..76f53aae0b 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -242,6 +242,14 @@ PrefetchState *MyPState; ) \ ) +#define ReceiveBufferNeedsCompaction() (\ + (MyPState->n_responses_buffered / 8) < ( \ + MyPState->ring_receive - \ + MyPState->ring_last - \ + MyPState->n_responses_buffered \ + ) \ +) + int n_prefetch_hits = 0; int n_prefetch_misses = 0; int n_prefetch_missed_caches = 0; @@ -249,17 +257,99 @@ int n_prefetch_dupes = 0; XLogRecPtr prefetch_lsn = 0; +static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); static bool prefetch_read(PrefetchRequest *slot); static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); static bool prefetch_wait_for(uint64 ring_index); -static void prefetch_cleanup(void); +static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno); +static bool +compact_prefetch_buffers(void) +{ + uint64 empty_ring_index = MyPState->ring_last; + uint64 search_ring_index = MyPState->ring_receive; + int n_moved = 0; + + if (MyPState->ring_receive == MyPState->ring_last) + return false; + + while (search_ring_index > MyPState->ring_last) + { + search_ring_index--; + if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED) + { + empty_ring_index = search_ring_index; + break; + } + } + + /* + * Here we have established: + * slots < search_ring_index may be unused (not scanned) + * slots >= search_ring_index and <= empty_ring_index are unused + * slots > empty_ring_index are in use, or outside our buffer's range. + * + * Therefore, there is a gap of at least one unused items between + * search_ring_index and empty_ring_index, which grows as we hit + * more unused items while moving backwards through the array. 
+ */ + + while (search_ring_index > MyPState->ring_last) + { + PrefetchRequest *source_slot; + PrefetchRequest *target_slot; + bool found; + + search_ring_index--; + + source_slot = GetPrfSlot(search_ring_index); + + if (source_slot->status == PRFS_UNUSED) + continue; + + target_slot = GetPrfSlot(empty_ring_index); + + Assert(source_slot->status == PRFS_RECEIVED); + Assert(target_slot->status == PRFS_UNUSED); + + target_slot->buftag = source_slot->buftag; + target_slot->status = source_slot->status; + target_slot->response = source_slot->response; + target_slot->effective_request_lsn = source_slot->effective_request_lsn; + target_slot->my_ring_index = empty_ring_index; + + prfh_delete(MyPState->prf_hash, source_slot); + prfh_insert(MyPState->prf_hash, target_slot, &found); + + Assert(!found); + + /* Adjust the location of our known-empty slot */ + empty_ring_index--; + + source_slot->status = PRFS_UNUSED; + source_slot->buftag = (BufferTag) {0}; + source_slot->response = NULL; + source_slot->my_ring_index = 0; + source_slot->effective_request_lsn = 0; + + n_moved++; + } + + if (MyPState->ring_last != empty_ring_index) + { + MyPState->ring_last = empty_ring_index; + return true; + } + + return false; +} + void readahead_buffer_resize(int newsize, void *extra) { @@ -323,7 +413,7 @@ readahead_buffer_resize(int newsize, void *extra) prfh_insert(newPState->prf_hash, newslot, &found); Assert(!found); - + switch (newslot->status) { case PRFS_UNUSED: @@ -370,7 +460,7 @@ consume_prefetch_responses(void) } static void -prefetch_cleanup(void) +prefetch_cleanup_trailing_unused(void) { uint64 ring_index; PrefetchRequest *slot; @@ -531,7 +621,10 @@ prefetch_set_unused(uint64 ring_index) /* run cleanup if we're holding back ring_last */ if (MyPState->ring_last == ring_index) - prefetch_cleanup(); + prefetch_cleanup_trailing_unused(); + /* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */ + else if (ReceiveBufferNeedsCompaction()) + compact_prefetch_buffers(); } static void @@ -702,20 +795,31 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls Assert(slot->status != PRFS_UNUSED); - /* We have the slot for ring_last, so that must still be in progress */ - switch (slot->status) + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. 
+ */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); - prefetch_set_unused(cleanup_index); - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - break; - default: - pg_unreachable(); + Assert(slot->status == PRFS_UNUSED); + } + else + { + /* We have the slot for ring_last, so that must still be in progress */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + prefetch_wait_for(cleanup_index); + prefetch_set_unused(cleanup_index); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + break; + default: + pg_unreachable(); + } } } @@ -1816,7 +1920,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, /* buffer was used, clean up for later reuse */ prefetch_set_unused(ring_index); - prefetch_cleanup(); + prefetch_cleanup_trailing_unused(); } /* From f5a735ac3b2c9c03f49c5140b53bc2c9b8236189 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 8 Dec 2022 12:24:24 +0100 Subject: [PATCH 031/167] Add proxy and broker to us-west-2 (#3027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Lassi Pölönen --- .../prod-us-west-2-eta.neon-proxy-scram.yaml | 31 +++++++++++ ...rod-us-west-2-eta.neon-storage-broker.yaml | 53 +++++++++++++++++++ .github/workflows/build_and_test.yml | 4 ++ 3 files changed, 88 insertions(+) create mode 100644 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml create mode 100644 .github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml new file mode 100644 index 0000000000..1747cb95b1 --- /dev/null +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -0,0 +1,31 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. 
+ +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.us-west-2.aws.neon.tech" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: us-west-2 + zenith_region_slug: us-west-2 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml new file mode 100644 index 0000000000..1c7cbfd44e --- /dev/null +++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml @@ -0,0 +1,53 @@ +# Helm chart values for neon-storage-broker +podLabels: + neon_env: production + neon_service: storage-broker + +ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: nginx-internal + nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" + + hosts: + - host: storage-broker-eta.us-west-2.aws.neon.tech + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - storage-broker-eta.us-west-2.aws.neon.tech + secretName: storage-broker-tls + + +metrics: + enabled: false + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-storage-broker.fullname\" . }}" + labels: + helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} + app.kubernetes.io/name: neon-storage-broker + app.kubernetes.io/instance: neon-storage-broker + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-storage-broker" + endpoints: + - port: broker + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a432e875dd..49f94ad60e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1031,6 +1031,8 @@ jobs: include: - target_region: us-east-2 target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta - target_region: eu-central-1 target_cluster: prod-eu-central-1-gamma - target_region: ap-southeast-1 @@ -1068,6 +1070,8 @@ jobs: include: - target_region: us-east-2 target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta - target_region: eu-central-1 target_cluster: prod-eu-central-1-gamma - target_region: ap-southeast-1 From 2baf6c09a842115dc30d007d6fff744037ec687c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 8 Dec 2022 15:17:28 +0400 Subject: [PATCH 032/167] Some more allowed pageserver errors. 
https://neondb.slack.com/archives/C033RQ5SPDH/p1670497680293859 --- test_runner/fixtures/neon_fixtures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6fae448794..cda518f26c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1751,6 +1751,7 @@ class NeonPageserver(PgProtocol): ".*Connection aborted: error communicating with the server: Connection reset by peer.*", ".*kill_and_wait_impl.*: wait successful.*", ".*end streaming to Some.*", + ".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down # safekeeper connection can fail with this, in the window between timeline creation # and streaming start ".*Failed to process query for timeline .*: state uninitialized, no data to read.*", @@ -1776,6 +1777,7 @@ class NeonPageserver(PgProtocol): ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() + ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ] def start( From 4de42172472dbc6e97e581891e6a75f0b39741c0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Dec 2022 14:50:59 +0000 Subject: [PATCH 033/167] Bump certifi from 2022.9.24 to 2022.12.7 (#3033) --- poetry.lock | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/poetry.lock b/poetry.lock index 716423d51e..2fa7f03679 100644 --- a/poetry.lock +++ b/poetry.lock @@ -525,7 +525,7 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2022.9.24" +version = "2022.12.7" description = "Python package for providing Mozilla's CA Bundle." 
category = "main" optional = false @@ -1248,8 +1248,8 @@ python-versions = ">=3.6" [package.dependencies] pytest = [ - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] [[package]] @@ -1702,8 +1702,8 @@ botocore-stubs = [ {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, ] certifi = [ - {file = "certifi-2022.9.24-py3-none-any.whl", hash = "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382"}, - {file = "certifi-2022.9.24.tar.gz", hash = "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14"}, + {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, + {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, ] cffi = [ {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, @@ -2036,6 +2036,7 @@ psutil = [ psycopg2-binary = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -2069,6 +2070,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -2080,6 +2082,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = 
"sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -2096,18 +2099,7 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyasn1 = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ @@ -2213,6 +2205,13 @@ pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, From 5c701f9a754cc9600eff34dc0e1fa1db761742c2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 8 Dec 2022 15:13:40 +0000 Subject: [PATCH 034/167] merge-allure-report: create report even if benchmarks is skipped (#3029) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 49f94ad60e..49187ab64a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -305,7 +305,7 @@ jobs: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init needs: [ regress-tests, benchmarks ] - if: success() || failure() + if: ${{ !cancelled() }} strategy: fail-fast: false matrix: From a19c4877662f6b244d4bbadd7350d1d480a37a8e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 8 Dec 2022 15:32:49 +0000 Subject: [PATCH 035/167] Nightly Benchmarks: add TPC-H benchmark (#2978) Ref: https://www.tpc.org/tpch/ --- .github/workflows/benchmarking.yml | 115 +++++++++++++++++- test_runner/performance/test_perf_olap.py | 35 ++++++ .../performance/tpc-h/create-indexes.sql | 43 +++++++ .../performance/tpc-h/create-schema.sql | 69 +++++++++++ test_runner/performance/tpc-h/queries/1.sql | 27 ++++ test_runner/performance/tpc-h/queries/10.sql | 38 ++++++ test_runner/performance/tpc-h/queries/11.sql | 34 ++++++ test_runner/performance/tpc-h/queries/12.sql | 35 ++++++ test_runner/performance/tpc-h/queries/13.sql | 27 ++++ test_runner/performance/tpc-h/queries/14.sql | 20 +++ test_runner/performance/tpc-h/queries/15.sql | 40 ++++++ test_runner/performance/tpc-h/queries/16.sql | 37 ++++++ test_runner/performance/tpc-h/queries/17.sql | 25 ++++ test_runner/performance/tpc-h/queries/18.sql | 39 ++++++ test_runner/performance/tpc-h/queries/19.sql | 42 +++++++ test_runner/performance/tpc-h/queries/2.sql | 50 ++++++++ test_runner/performance/tpc-h/queries/20.sql | 44 +++++++ test_runner/performance/tpc-h/queries/21.sql | 46 +++++++ test_runner/performance/tpc-h/queries/22.sql | 44 +++++++ test_runner/performance/tpc-h/queries/3.sql | 29 +++++ test_runner/performance/tpc-h/queries/4.sql | 28 +++++ test_runner/performance/tpc-h/queries/5.sql | 31 +++++ 
test_runner/performance/tpc-h/queries/6.sql | 16 +++ test_runner/performance/tpc-h/queries/7.sql | 46 +++++++ test_runner/performance/tpc-h/queries/8.sql | 44 +++++++ test_runner/performance/tpc-h/queries/9.sql | 39 ++++++ 26 files changed, 1042 insertions(+), 1 deletion(-) create mode 100644 test_runner/performance/tpc-h/create-indexes.sql create mode 100644 test_runner/performance/tpc-h/create-schema.sql create mode 100644 test_runner/performance/tpc-h/queries/1.sql create mode 100644 test_runner/performance/tpc-h/queries/10.sql create mode 100644 test_runner/performance/tpc-h/queries/11.sql create mode 100644 test_runner/performance/tpc-h/queries/12.sql create mode 100644 test_runner/performance/tpc-h/queries/13.sql create mode 100644 test_runner/performance/tpc-h/queries/14.sql create mode 100644 test_runner/performance/tpc-h/queries/15.sql create mode 100644 test_runner/performance/tpc-h/queries/16.sql create mode 100644 test_runner/performance/tpc-h/queries/17.sql create mode 100644 test_runner/performance/tpc-h/queries/18.sql create mode 100644 test_runner/performance/tpc-h/queries/19.sql create mode 100644 test_runner/performance/tpc-h/queries/2.sql create mode 100644 test_runner/performance/tpc-h/queries/20.sql create mode 100644 test_runner/performance/tpc-h/queries/21.sql create mode 100644 test_runner/performance/tpc-h/queries/22.sql create mode 100644 test_runner/performance/tpc-h/queries/3.sql create mode 100644 test_runner/performance/tpc-h/queries/4.sql create mode 100644 test_runner/performance/tpc-h/queries/5.sql create mode 100644 test_runner/performance/tpc-h/queries/6.sql create mode 100644 test_runner/performance/tpc-h/queries/7.sql create mode 100644 test_runner/performance/tpc-h/queries/8.sql create mode 100644 test_runner/performance/tpc-h/queries/9.sql diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ec2bea9058..eb9ba70371 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -385,7 +385,7 @@ jobs: env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Benchmark clickbench + - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -413,3 +413,116 @@ jobs: slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + tpch-compare: + # TCP-H DB for rds-aurora and rds-Postgres deployed to the same clusters + # we use for performance testing in pgbench-compare & clickbench-compare. + # Run this job only when clickbench-compare is finished to avoid the intersection. + # We might change it after https://github.com/neondatabase/neon/issues/2900. 
+ # + # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) + if: success() || failure() + needs: [ clickbench-compare ] + + strategy: + fail-fast: false + matrix: + # neon-captest-prefetch: We have pre-created projects with prefetch enabled + # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs + # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage + platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + PLATFORM: ${{ matrix.platform }} + + # NOTE: Here we use non-standadard `captest` runner (instead of `dev`) which is located in us-east-2 region. + # We will move the rest of benchmarking jobs to staging in https://github.com/neondatabase/neon/pull/2838 + runs-on: [ self-hosted, captest, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned + options: --init + + timeout-minutes: 360 # 6h + + steps: + - uses: actions/checkout@v3 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH + run: | + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${PLATFORM}" in + neon-captest-prefetch) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }} + ;; + rds-aurora) + CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }} + ;; + rds-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }} + ;; + *) + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + psql ${CONNSTR} -c "SELECT version();" + + - name: Set database options + if: matrix.platform == 'neon-captest-prefetch' + run: | + DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") + + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Run TPC-H benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_perf_olap.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_tpch + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Create Allure report + if: success() || failure() + uses: ./.github/actions/allure-report + with: + action: generate + build_type: ${{ env.BUILD_TYPE }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 84693325c0..8c3b6e57ff 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -2,8 +2,10 @@ from dataclasses import dataclass from typing import Dict, Tuple import pytest +from _pytest.mark import ParameterSet from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log +from fixtures.utils import get_self_dir @dataclass @@ -109,3 +111,36 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare): """ run_psql(remote_compare, query, times=3) + + +def tpch_queuies() -> Tuple[ParameterSet, ...]: + """ + A list of queries to run for the TPC-H benchmark. 
+ - querues in returning tuple are ordered by the query number + - pytest parameters id is adjusted to match the query id (the numbering starts from 1) + """ + queries_dir = get_self_dir().parent / "performance" / "tpc-h" / "queries" + assert queries_dir.exists(), f"TPC-H queries dir not found: {queries_dir}" + + return tuple( + pytest.param(LabelledQuery(f"Q{f.stem}", f.read_text()), id=f"query{f.stem}") + for f in sorted(queries_dir.glob("*.sql"), key=lambda f: int(f.stem)) + ) + + +@pytest.mark.parametrize("query", tpch_queuies()) +@pytest.mark.remote_cluster +def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare): + """ + TCP-H Benchmark + + The DB prepared manually in advance: + - schema: test_runner/performance/tpc-h/create-schema.sql + - indexes: test_runner/performance/tpc-h/create-indexes.sql + - data generated by `dbgen` program of the official TPC-H benchmark + - `VACUUM (FREEZE, PARALLEL 0);` + + For query generation `1669822882` is used as a seed to the RNG + """ + + run_psql(remote_compare, query, times=1) diff --git a/test_runner/performance/tpc-h/create-indexes.sql b/test_runner/performance/tpc-h/create-indexes.sql new file mode 100644 index 0000000000..590a9c1900 --- /dev/null +++ b/test_runner/performance/tpc-h/create-indexes.sql @@ -0,0 +1,43 @@ +-- Section 1.4.2.2 + +ALTER TABLE part ADD PRIMARY KEY (p_partkey); +ALTER TABLE supplier ADD PRIMARY KEY (s_suppkey); +ALTER TABLE partsupp ADD PRIMARY KEY (ps_partkey, ps_suppkey); +ALTER TABLE customer ADD PRIMARY KEY (c_custkey); +ALTER TABLE orders ADD PRIMARY KEY (o_orderkey); +ALTER TABLE lineitem ADD PRIMARY KEY (l_orderkey, l_linenumber); +ALTER TABLE nation ADD PRIMARY KEY (n_nationkey); +ALTER TABLE region ADD PRIMARY KEY (r_regionkey); + +-- Section 1.4.2.3 + +CREATE INDEX ON supplier USING btree (s_nationkey); +ALTER TABLE supplier ADD FOREIGN KEY (s_nationkey) REFERENCES nation (n_nationkey); + +/* IGNORE: implied by primary key */ +-- CREATE INDEX ON partsupp USING btree (ps_partkey); +CREATE INDEX ON partsupp USING btree (ps_suppkey); +ALTER TABLE partsupp ADD FOREIGN KEY (ps_partkey) REFERENCES part (p_partkey); +ALTER TABLE partsupp ADD FOREIGN KEY (ps_suppkey) REFERENCES supplier (s_suppkey); + +CREATE INDEX ON customer USING btree (c_nationkey); +ALTER TABLE customer ADD FOREIGN KEY (c_nationkey) REFERENCES nation (n_nationkey); + +CREATE INDEX ON orders USING btree (o_custkey); +ALTER TABLE orders ADD FOREIGN KEY (o_custkey) REFERENCES customer (c_custkey); + +/* IGNORE: implied by primary key */ +-- CREATE INDEX ON lineitem USING btree (l_orderkey); +CREATE INDEX ON lineitem USING btree (l_partkey, l_suppkey); +CREATE INDEX ON lineitem USING btree (l_suppkey); +ALTER TABLE lineitem ADD FOREIGN KEY (l_orderkey) REFERENCES orders (o_orderkey); +ALTER TABLE lineitem ADD FOREIGN KEY (l_partkey) REFERENCES part (p_partkey); +ALTER TABLE lineitem ADD FOREIGN KEY (l_suppkey) REFERENCES supplier (s_suppkey); +ALTER TABLE lineitem ADD FOREIGN KEY (l_partkey, l_suppkey) REFERENCES partsupp (ps_partkey, ps_suppkey); + +CREATE INDEX ON nation USING btree (n_regionkey); +ALTER TABLE nation ADD FOREIGN KEY (n_regionkey) REFERENCES region (r_regionkey); + +-- Section 1.4.2.4 + +ALTER TABLE lineitem ADD CHECK (l_shipdate <= l_receiptdate); diff --git a/test_runner/performance/tpc-h/create-schema.sql b/test_runner/performance/tpc-h/create-schema.sql new file mode 100644 index 0000000000..4293951aa1 --- /dev/null +++ b/test_runner/performance/tpc-h/create-schema.sql @@ -0,0 +1,69 @@ +-- Sccsid: @(#)dss.ddl 
2.1.8.1 +CREATE TABLE NATION ( N_NATIONKEY INTEGER NOT NULL, + N_NAME CHAR(25) NOT NULL, + N_REGIONKEY INTEGER NOT NULL, + N_COMMENT VARCHAR(152)); + +CREATE TABLE REGION ( R_REGIONKEY INTEGER NOT NULL, + R_NAME CHAR(25) NOT NULL, + R_COMMENT VARCHAR(152)); + +CREATE TABLE PART ( P_PARTKEY INTEGER NOT NULL, + P_NAME VARCHAR(55) NOT NULL, + P_MFGR CHAR(25) NOT NULL, + P_BRAND CHAR(10) NOT NULL, + P_TYPE VARCHAR(25) NOT NULL, + P_SIZE INTEGER NOT NULL, + P_CONTAINER CHAR(10) NOT NULL, + P_RETAILPRICE DECIMAL(15,2) NOT NULL, + P_COMMENT VARCHAR(23) NOT NULL ); + +CREATE TABLE SUPPLIER ( S_SUPPKEY INTEGER NOT NULL, + S_NAME CHAR(25) NOT NULL, + S_ADDRESS VARCHAR(40) NOT NULL, + S_NATIONKEY INTEGER NOT NULL, + S_PHONE CHAR(15) NOT NULL, + S_ACCTBAL DECIMAL(15,2) NOT NULL, + S_COMMENT VARCHAR(101) NOT NULL); + +CREATE TABLE PARTSUPP ( PS_PARTKEY INTEGER NOT NULL, + PS_SUPPKEY INTEGER NOT NULL, + PS_AVAILQTY INTEGER NOT NULL, + PS_SUPPLYCOST DECIMAL(15,2) NOT NULL, + PS_COMMENT VARCHAR(199) NOT NULL ); + +CREATE TABLE CUSTOMER ( C_CUSTKEY INTEGER NOT NULL, + C_NAME VARCHAR(25) NOT NULL, + C_ADDRESS VARCHAR(40) NOT NULL, + C_NATIONKEY INTEGER NOT NULL, + C_PHONE CHAR(15) NOT NULL, + C_ACCTBAL DECIMAL(15,2) NOT NULL, + C_MKTSEGMENT CHAR(10) NOT NULL, + C_COMMENT VARCHAR(117) NOT NULL); + +CREATE TABLE ORDERS ( O_ORDERKEY INTEGER NOT NULL, + O_CUSTKEY INTEGER NOT NULL, + O_ORDERSTATUS CHAR(1) NOT NULL, + O_TOTALPRICE DECIMAL(15,2) NOT NULL, + O_ORDERDATE DATE NOT NULL, + O_ORDERPRIORITY CHAR(15) NOT NULL, + O_CLERK CHAR(15) NOT NULL, + O_SHIPPRIORITY INTEGER NOT NULL, + O_COMMENT VARCHAR(79) NOT NULL); + +CREATE TABLE LINEITEM ( L_ORDERKEY INTEGER NOT NULL, + L_PARTKEY INTEGER NOT NULL, + L_SUPPKEY INTEGER NOT NULL, + L_LINENUMBER INTEGER NOT NULL, + L_QUANTITY DECIMAL(15,2) NOT NULL, + L_EXTENDEDPRICE DECIMAL(15,2) NOT NULL, + L_DISCOUNT DECIMAL(15,2) NOT NULL, + L_TAX DECIMAL(15,2) NOT NULL, + L_RETURNFLAG CHAR(1) NOT NULL, + L_LINESTATUS CHAR(1) NOT NULL, + L_SHIPDATE DATE NOT NULL, + L_COMMITDATE DATE NOT NULL, + L_RECEIPTDATE DATE NOT NULL, + L_SHIPINSTRUCT CHAR(25) NOT NULL, + L_SHIPMODE CHAR(10) NOT NULL, + L_COMMENT VARCHAR(44) NOT NULL); diff --git a/test_runner/performance/tpc-h/queries/1.sql b/test_runner/performance/tpc-h/queries/1.sql new file mode 100644 index 0000000000..2e1967fec8 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/1.sql @@ -0,0 +1,27 @@ +-- $ID$ +-- TPC-H/TPC-R Pricing Summary Report Query (Q1) +-- Functional Query Definition +-- Approved February 1998 + + +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-12-01' - interval '89' day +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus; diff --git a/test_runner/performance/tpc-h/queries/10.sql b/test_runner/performance/tpc-h/queries/10.sql new file mode 100644 index 0000000000..0569e2ed86 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/10.sql @@ -0,0 +1,38 @@ +-- $ID$ +-- TPC-H/TPC-R Returned Item Reporting Query (Q10) +-- Functional Query Definition +-- Approved February 1998 + + +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + 
c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-08-01' + and o_orderdate < date '1993-08-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc +limit 20; diff --git a/test_runner/performance/tpc-h/queries/11.sql b/test_runner/performance/tpc-h/queries/11.sql new file mode 100644 index 0000000000..f7500c260e --- /dev/null +++ b/test_runner/performance/tpc-h/queries/11.sql @@ -0,0 +1,34 @@ +-- $ID$ +-- TPC-H/TPC-R Important Stock Identification Query (Q11) +-- Functional Query Definition +-- Approved February 1998 + + +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'INDONESIA' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'INDONESIA' + ) +order by + value desc +; diff --git a/test_runner/performance/tpc-h/queries/12.sql b/test_runner/performance/tpc-h/queries/12.sql new file mode 100644 index 0000000000..bd879321c8 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/12.sql @@ -0,0 +1,35 @@ +-- $ID$ +-- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12) +-- Functional Query Definition +-- Approved February 1998 + + +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + orders, + lineitem +where + o_orderkey = l_orderkey + and l_shipmode in ('REG AIR', 'AIR') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1995-01-01' + and l_receiptdate < date '1995-01-01' + interval '1' year +group by + l_shipmode +order by + l_shipmode +; diff --git a/test_runner/performance/tpc-h/queries/13.sql b/test_runner/performance/tpc-h/queries/13.sql new file mode 100644 index 0000000000..554b2bec92 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/13.sql @@ -0,0 +1,27 @@ +-- $ID$ +-- TPC-H/TPC-R Customer Distribution Query (Q13) +-- Functional Query Definition +-- Approved February 1998 + + +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%special%accounts%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc +; diff --git a/test_runner/performance/tpc-h/queries/14.sql b/test_runner/performance/tpc-h/queries/14.sql new file mode 100644 index 0000000000..794a5656f5 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/14.sql @@ -0,0 +1,20 @@ +-- $ID$ +-- TPC-H/TPC-R Promotion Effect Query (Q14) +-- Functional Query Definition +-- Approved February 1998 + + +select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-07-01' + and l_shipdate < date 
'1995-07-01' + interval '1' month +; diff --git a/test_runner/performance/tpc-h/queries/15.sql b/test_runner/performance/tpc-h/queries/15.sql new file mode 100644 index 0000000000..5d618c9906 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/15.sql @@ -0,0 +1,40 @@ +-- $ID$ +-- TPC-H/TPC-R Top Supplier Query (Q15) +-- Functional Query Definition +-- Approved February 1998 + +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1995-01-01' + and l_shipdate < date '1995-01-01' + interval '3' month + group by + l_suppkey; + + +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey; + +drop view revenue0 +; diff --git a/test_runner/performance/tpc-h/queries/16.sql b/test_runner/performance/tpc-h/queries/16.sql new file mode 100644 index 0000000000..f525d55d5d --- /dev/null +++ b/test_runner/performance/tpc-h/queries/16.sql @@ -0,0 +1,37 @@ +-- $ID$ +-- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16) +-- Functional Query Definition +-- Approved February 1998 + + +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#43' + and p_type not like 'PROMO POLISHED%' + and p_size in (35, 5, 42, 13, 11, 40, 50, 47) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size +; diff --git a/test_runner/performance/tpc-h/queries/17.sql b/test_runner/performance/tpc-h/queries/17.sql new file mode 100644 index 0000000000..7d736cd3b5 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/17.sql @@ -0,0 +1,25 @@ + +-- $ID$ +-- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17) +-- Functional Query Definition +-- Approved February 1998 + + +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#35' + and p_container = 'JUMBO JAR' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey + ) +; diff --git a/test_runner/performance/tpc-h/queries/18.sql b/test_runner/performance/tpc-h/queries/18.sql new file mode 100644 index 0000000000..13f7ce7306 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/18.sql @@ -0,0 +1,39 @@ +-- $ID$ +-- TPC-H/TPC-R Large Volume Customer Query (Q18) +-- Function Query Definition +-- Approved February 1998 + + +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 315 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate +limit 100; diff --git a/test_runner/performance/tpc-h/queries/19.sql b/test_runner/performance/tpc-h/queries/19.sql new file mode 100644 index 0000000000..43a64bde6f --- /dev/null +++ b/test_runner/performance/tpc-h/queries/19.sql @@ -0,0 +1,42 @@ +-- $ID$ +-- TPC-H/TPC-R Discounted Revenue Query (Q19) +-- Functional Query Definition +-- Approved February 1998 + 
+ +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#41' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 10 and l_quantity <= 10 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#52' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#14' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 22 and l_quantity <= 22 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) +; diff --git a/test_runner/performance/tpc-h/queries/2.sql b/test_runner/performance/tpc-h/queries/2.sql new file mode 100644 index 0000000000..2e8164b65a --- /dev/null +++ b/test_runner/performance/tpc-h/queries/2.sql @@ -0,0 +1,50 @@ +-- $ID$ +-- TPC-H/TPC-R Minimum Cost Supplier Query (Q2) +-- Functional Query Definition +-- Approved February 1998 + + +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 39 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'MIDDLE EAST' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'MIDDLE EAST' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey +limit 100; diff --git a/test_runner/performance/tpc-h/queries/20.sql b/test_runner/performance/tpc-h/queries/20.sql new file mode 100644 index 0000000000..7e587783c5 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/20.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R Potential Part Promotion Query (Q20) +-- Function Query Definition +-- Approved February 1998 + + +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'bisque%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1997-01-01' + and l_shipdate < date '1997-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'ETHIOPIA' +order by + s_name +; diff --git a/test_runner/performance/tpc-h/queries/21.sql b/test_runner/performance/tpc-h/queries/21.sql new file mode 100644 index 0000000000..9a0a88236e --- /dev/null +++ b/test_runner/performance/tpc-h/queries/21.sql @@ -0,0 +1,46 @@ +-- $ID$ +-- TPC-H/TPC-R Suppliers Who Kept Orders Waiting Query (Q21) +-- Functional Query Definition +-- Approved February 1998 + + +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + 
where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name +limit 100; diff --git a/test_runner/performance/tpc-h/queries/22.sql b/test_runner/performance/tpc-h/queries/22.sql new file mode 100644 index 0000000000..965239f194 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/22.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R Global Sales Opportunity Query (Q22) +-- Functional Query Definition +-- Approved February 1998 + + +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('15', '14', '29', '34', '33', '19', '13') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('15', '14', '29', '34', '33', '19', '13') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode +; diff --git a/test_runner/performance/tpc-h/queries/3.sql b/test_runner/performance/tpc-h/queries/3.sql new file mode 100644 index 0000000000..bbb8f7371a --- /dev/null +++ b/test_runner/performance/tpc-h/queries/3.sql @@ -0,0 +1,29 @@ +-- $ID$ +-- TPC-H/TPC-R Shipping Priority Query (Q3) +-- Functional Query Definition +-- Approved February 1998 + + +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'AUTOMOBILE' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-26' + and l_shipdate > date '1995-03-26' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate +limit 10; diff --git a/test_runner/performance/tpc-h/queries/4.sql b/test_runner/performance/tpc-h/queries/4.sql new file mode 100644 index 0000000000..098b203414 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/4.sql @@ -0,0 +1,28 @@ +-- $ID$ +-- TPC-H/TPC-R Order Priority Checking Query (Q4) +-- Functional Query Definition +-- Approved February 1998 + + +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= date '1996-12-01' + and o_orderdate < date '1996-12-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority +; diff --git a/test_runner/performance/tpc-h/queries/5.sql b/test_runner/performance/tpc-h/queries/5.sql new file mode 100644 index 0000000000..393e17987f --- /dev/null +++ b/test_runner/performance/tpc-h/queries/5.sql @@ -0,0 +1,31 @@ +-- $ID$ +-- TPC-H/TPC-R Local Supplier Volume Query (Q5) +-- Functional Query Definition +-- Approved February 1998 + + +select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= date '1996-01-01' + 
and o_orderdate < date '1996-01-01' + interval '1' year +group by + n_name +order by + revenue desc +; diff --git a/test_runner/performance/tpc-h/queries/6.sql b/test_runner/performance/tpc-h/queries/6.sql new file mode 100644 index 0000000000..90ebcd4782 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/6.sql @@ -0,0 +1,16 @@ +-- $ID$ +-- TPC-H/TPC-R Forecasting Revenue Change Query (Q6) +-- Functional Query Definition +-- Approved February 1998 + + +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1996-01-01' + and l_shipdate < date '1996-01-01' + interval '1' year + and l_discount between 0.02 - 0.01 and 0.02 + 0.01 + and l_quantity < 24 +; diff --git a/test_runner/performance/tpc-h/queries/7.sql b/test_runner/performance/tpc-h/queries/7.sql new file mode 100644 index 0000000000..8a34724b38 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/7.sql @@ -0,0 +1,46 @@ +-- $ID$ +-- TPC-H/TPC-R Volume Shipping Query (Q7) +-- Functional Query Definition +-- Approved February 1998 + + +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'ALGERIA' and n2.n_name = 'CANADA') + or (n1.n_name = 'CANADA' and n2.n_name = 'ALGERIA') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year +; diff --git a/test_runner/performance/tpc-h/queries/8.sql b/test_runner/performance/tpc-h/queries/8.sql new file mode 100644 index 0000000000..f8259c960b --- /dev/null +++ b/test_runner/performance/tpc-h/queries/8.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R National Market Share Query (Q8) +-- Functional Query Definition +-- Approved February 1998 + + +select + o_year, + sum(case + when nation = 'CANADA' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'SMALL POLISHED BRASS' + ) as all_nations +group by + o_year +order by + o_year +; diff --git a/test_runner/performance/tpc-h/queries/9.sql b/test_runner/performance/tpc-h/queries/9.sql new file mode 100644 index 0000000000..d2e2df9f00 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/9.sql @@ -0,0 +1,39 @@ +-- $ID$ +-- TPC-H/TPC-R Product Type Profit Measure Query (Q9) +-- Functional Query Definition +-- Approved February 1998 + + +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, 
+ partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%firebrick%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc +; From 9747e90f3ac4a5c126c134d1ea07f7699923d59c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 8 Dec 2022 22:28:25 +0000 Subject: [PATCH 036/167] Nightly Benchmarks: Move from captest to staging (#2838) Migrate Nightly Benchmarks from captest to staging. - Migrate GitHub Workflows - Replace `zenith-benchmarker` with regular runners - Remove `environment` parameter from Neon GitHub Actions, add `postgres_version` - The only job left on captest is `neon-captest-reuse`, which will be moved to staging after its project migration. Ref https://github.com/neondatabase/cloud/issues/2836 --- .github/actions/neon-branch-create/action.yml | 40 ++--- .github/actions/neon-branch-delete/action.yml | 29 +--- .../actions/neon-project-create/action.yml | 43 ++--- .../actions/neon-project-delete/action.yml | 29 +--- .github/workflows/benchmarking.yml | 156 +++++++----------- .github/workflows/pg_clients.yml | 6 +- 6 files changed, 97 insertions(+), 206 deletions(-) diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index 9102bcf3da..7ee43a3587 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -5,12 +5,12 @@ inputs: api_key: desctiption: 'Neon API key' required: true - environment: - desctiption: 'dev (aka captest) or staging' - required: true project_id: desctiption: 'ID of the Project to create Branch in' required: true + api_host: + desctiption: 'Neon API host' + default: console.stage.neon.tech outputs: dsn: description: 'Created Branch DSN (for main database)' @@ -22,27 +22,6 @@ outputs: runs: using: "composite" steps: - - name: Parse Input - id: parse-input - shell: bash -euxo pipefail {0} - run: | - case "${ENVIRONMENT}" in - dev) - API_HOST=console.dev.neon.tech - ;; - staging) - API_HOST=console.stage.neon.tech - ;; - *) - echo 2>&1 "Unknown environment=${ENVIRONMENT}. 
Allowed 'dev' or 'staging' only" - exit 1 - ;; - esac - - echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT - env: - ENVIRONMENT: ${{ inputs.environment }} - - name: Create New Branch id: create-branch shell: bash -euxo pipefail {0} @@ -56,7 +35,12 @@ runs: --data "{ \"branch\": { \"name\": \"Created by actions/neon-branch-create; GITHUB_RUN_ID=${GITHUB_RUN_ID} at $(date +%s)\" - } + }, + \"endpoints\": [ + { + \"type\": \"read_write\" + } + ] }") if [ -z "${branch}" ]; then @@ -84,8 +68,8 @@ runs: host=$(echo $branch | jq --raw-output '.endpoints[0].host') echo "host=${host}" >> $GITHUB_OUTPUT env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} PROJECT_ID: ${{ inputs.project_id }} - name: Get Role name @@ -103,8 +87,8 @@ runs: role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name') echo "role_name=${role_name}" >> $GITHUB_OUTPUT env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} PROJECT_ID: ${{ inputs.project_id }} BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }} @@ -146,8 +130,8 @@ runs: echo "::add-mask::${dsn}" echo "dsn=${dsn}" >> $GITHUB_OUTPUT env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} PROJECT_ID: ${{ inputs.project_id }} BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }} ROLE_NAME: ${{ steps.role-name.outputs.role_name }} diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index 3ca96ced11..5689093e2e 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -5,40 +5,19 @@ inputs: api_key: desctiption: 'Neon API key' required: true - environment: - desctiption: 'dev (aka captest) or staging' - required: true project_id: desctiption: 'ID of the Project which should be deleted' required: true branch_id: desctiption: 'ID of the branch to delete' required: true + api_host: + desctiption: 'Neon API host' + default: console.stage.neon.tech runs: using: "composite" steps: - - name: Parse Input - id: parse-input - shell: bash -euxo pipefail {0} - run: | - case "${ENVIRONMENT}" in - dev) - API_HOST=console.dev.neon.tech - ;; - staging) - API_HOST=console.stage.neon.tech - ;; - *) - echo 2>&1 "Unknown environment=${ENVIRONMENT}. 
Allowed 'dev' or 'staging' only" - exit 1 - ;; - esac - - echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT - env: - ENVIRONMENT: ${{ inputs.environment }} - - name: Delete Branch # Do not try to delete a branch if .github/actions/neon-project-create # or .github/actions/neon-branch-create failed before @@ -73,7 +52,7 @@ runs: exit 1 fi env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} BRANCH_ID: ${{ inputs.branch_id }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 8399d6c511..b9d9182882 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -5,12 +5,16 @@ inputs: api_key: desctiption: 'Neon API key' required: true - environment: - desctiption: 'dev (aka captest) or staging' - required: true region_id: desctiption: 'Region ID, if not set the project will be created in the default region' - required: false + default: aws-us-east-2 + postgres_version: + desctiption: 'Postgres version; default is 15' + default: 15 + api_host: + desctiption: 'Neon API host' + default: console.stage.neon.tech + outputs: dsn: description: 'Created Project DSN (for main database)' @@ -22,31 +26,6 @@ outputs: runs: using: "composite" steps: - - name: Parse Input - id: parse-input - shell: bash -euxo pipefail {0} - run: | - case "${ENVIRONMENT}" in - dev) - API_HOST=console.dev.neon.tech - REGION_ID=${REGION_ID:-aws-eu-west-1} - ;; - staging) - API_HOST=console.stage.neon.tech - REGION_ID=${REGION_ID:-aws-us-east-2} - ;; - *) - echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only" - exit 1 - ;; - esac - - echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT - echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT - env: - ENVIRONMENT: ${{ inputs.environment }} - REGION_ID: ${{ inputs.region_id }} - - name: Create Neon Project id: create-neon-project # A shell without `set -x` to not to expose password/dsn in logs @@ -61,6 +40,7 @@ runs: --data "{ \"project\": { \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", + \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", \"settings\": { } } @@ -76,6 +56,7 @@ runs: project_id=$(echo $project | jq --raw-output '.project.id') echo "project_id=${project_id}" >> $GITHUB_OUTPUT env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} - REGION_ID: ${{ steps.parse-input.outputs.region_id }} + REGION_ID: ${{ inputs.region_id }} + POSTGRES_VERSION: ${{ inputs.postgres_version }} diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index 88b5d3fc5b..cd58c629e5 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -5,37 +5,16 @@ inputs: api_key: desctiption: 'Neon API key' required: true - environment: - desctiption: 'dev (aka captest) or staging' - required: true project_id: desctiption: 'ID of the Project to delete' required: true + api_host: + desctiption: 'Neon API host' + default: console.stage.neon.tech runs: using: "composite" steps: - - name: Parse Input - id: parse-input - shell: bash -euxo pipefail {0} - run: | - case "${ENVIRONMENT}" in - dev) - API_HOST=console.dev.neon.tech - ;; - staging) - API_HOST=console.stage.neon.tech - ;; - *) - echo 2>&1 "Unknown environment=${ENVIRONMENT}. 
Allowed 'dev' or 'staging' only" - exit 1 - ;; - esac - - echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT - env: - ENVIRONMENT: ${{ inputs.environment }} - - name: Delete Neon Project # Do not try to delete a project if .github/actions/neon-project-create failed before if: ${{ inputs.project_id != '' }} @@ -49,6 +28,6 @@ runs: --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index eb9ba70371..a2c05a9222 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -15,9 +15,6 @@ on: workflow_dispatch: # adds ability to run this manually inputs: - environment: - description: 'Environment to run remote tests on (dev or staging)' - required: false region_id: description: 'Use a particular region. If not set the default region will be used' required: false @@ -37,103 +34,69 @@ concurrency: jobs: bench: - # this workflow runs on self hosteed runner - # it's environment is quite different from usual guthub runner - # probably the most important difference is that it doesn't start from clean workspace each time - # e g if you install system packages they are not cleaned up since you install them directly in host machine - # not a container or something - # See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners - runs-on: [self-hosted, zenith-benchmarker] - env: - POSTGRES_DISTRIB_DIR: /usr/pgsql + TEST_PG_BENCH_DURATIONS_MATRIX: "300" + TEST_PG_BENCH_SCALES_MATRIX: "10,100" + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init steps: - - name: Checkout zenith repo - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - # actions/setup-python@v2 is not working correctly on self-hosted runners - # see https://github.com/actions/setup-python/issues/162 - # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular - # so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs. 
- # there is Python 3.7.10 already installed on the machine so use it to install poetry and then use poetry's virtuealenvs - - name: Install poetry & deps - run: | - python3 -m pip install --upgrade poetry wheel - # since pip/poetry caches are reused there shouldn't be any troubles with install every time - ./scripts/pysync - - - name: Show versions - run: | - echo Python - python3 --version - poetry run python3 --version - echo Poetry - poetry --version - echo Pgbench - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest - name: Create Neon Project id: create-neon-project uses: ./.github/actions/neon-project-create with: - environment: ${{ github.event.inputs.environment || 'staging' }} - api_key: ${{ ( github.event.inputs.environment || 'staging' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }} + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Run benchmark - # pgbench is installed system wide from official repo - # https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/ - # via - # sudo tee /etc/yum.repos.d/pgdg.repo< Date: Fri, 9 Dec 2022 10:02:23 +0100 Subject: [PATCH 037/167] Update PostgreSQL to latest vendored releases (#3037) Several fixes are included, with among others: - Prefetching for index bulkdelete calls (e.g. during vacuum), plus v14 compiler warning fix - A fix for setting LSN on heap pages while setting vm bits - Some style updates that were lost in the previous wave (v15 only) --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 06edb5af61..c22aea6714 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 06edb5af6180f99ee1bd6903bae2898d2ea128ef +Subproject commit c22aea67149a2fe71cab881be7a31fba305ddc21 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index edf4c161dd..114da43a49 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit edf4c161dd0182d22c28297e841ca253bc1b8ee4 +Subproject commit 114da43a4967c068c958dacd6dedf65053c99148 From 3122f3282f1a7f4639141f5a5a451cefae53a43d Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 8 Dec 2022 15:52:02 +0200 Subject: [PATCH 038/167] Ignore backup files (ones with .n.old suffix) in download_missing This is rather a hack to resolve immediate issue: https://github.com/neondatabase/neon/issues/3024 Properly cleaning this file from index part requires changes to initialization of remote queue. Because we need to clean it up earlier than we start warking around files. 
With on-demand there will be no walk around layer files becase download_missing is no longer needed, so I believe it will be natural to unify this with load_layer_map --- pageserver/src/tenant/timeline.rs | 6 ++ test_runner/fixtures/neon_fixtures.py | 1 + .../test_tenants_with_remote_storage.py | 85 +++++++++++++++++-- 3 files changed, 83 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ec8049bcea..aab5d6f1d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1126,6 +1126,12 @@ impl Timeline { .unwrap() .insert_historic(Arc::new(delta_layer)); self.metrics.current_physical_size_gauge.add(sz); + } else if layer_name.ends_with(".old") { + // For details see https://github.com/neondatabase/neon/issues/3024 + warn!( + "got backup file on the remote storage, ignoring it {file}", + file = layer_name + ) } else { bail!("unexpected layer filename {layer_name} in remote storage path: {remote_layer_path:?}"); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index cda518f26c..d20e591e9b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2896,6 +2896,7 @@ def assert_no_in_progress_downloads_for_tenant( ): tenant_status = pageserver_http_client.tenant_status(tenant) assert tenant_status["has_in_progress_downloads"] is False, tenant_status + assert tenant_status["state"] == "Active" def remote_consistent_lsn( diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 76639e4055..1228f8b86e 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -248,15 +248,6 @@ def test_tenant_upgrades_index_json_from_v0( # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade env = neon_env_builder.init_start() - # FIXME: Are these expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - env.pageserver.allowed_errors.append( - ".*Failed to get local tenant state: Tenant .* not found in the local state.*" - ) - pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -352,6 +343,82 @@ def test_tenant_upgrades_index_json_from_v0( # FIXME: test index_part.json getting downgraded from imaginary new version +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenant_ignores_backup_file( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + # getting a too eager compaction happening for this test would not play + # well with the strict assertions. 
+ neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'" + + neon_env_builder.enable_remote_storage(remote_storage_kind, "test_tenant_ignores_backup_file") + + # launch pageserver, populate the default tenants timeline, wait for it to be uploaded, + # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append(".*got backup file on the remote storage, ignoring it.*") + + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # flush, wait until in remote storage + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + env.postgres.stop_all() + env.pageserver.stop() + + # change the remote file to have entry with .0.old suffix + timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) + with open(timeline_path, "r+") as timeline_file: + # keep the deserialized for later inspection + orig_index_part = json.load(timeline_file) + backup_layer_name = orig_index_part["timeline_layers"][0] + ".0.old" + orig_index_part["timeline_layers"].append(backup_layer_name) + + timeline_file.seek(0) + json.dump(orig_index_part, timeline_file) + + env.pageserver.start() + pageserver_http = env.pageserver.http_client() + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + ) + + pg = env.postgres.create_start("main") + + with pg.cursor() as cur: + cur.execute("INSERT INTO t0 VALUES (234, 'test data');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # not needed anymore + env.postgres.stop_all() + env.pageserver.stop() + + # file is still mentioned in the index. Removing it requires more hacking on remote queue initialization + # Will be easier to do once there will be no .download_missing so it will be only one cycle through the layers + # in load_layer_map + new_index_part = local_fs_index_part(env, tenant_id, timeline_id) + backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"]) + assert len(list(backup_layers)) == 1 + + @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_tenant_redownloads_truncated_file_on_startup( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind From 6c8b2af1f8a067d0c8028dc90bf32fef83b1f00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Fri, 9 Dec 2022 11:12:42 +0200 Subject: [PATCH 039/167] Change storage brokers to internal subdomain (#3039) There's a bit of a clash with the naming, so dedicate a subdomain for storage brokers. Back to subdomain separation just to be consistent. 
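For reference, a small Python sketch of the resulting host layout (the helper
is hypothetical and only illustrates the naming convention used in the helm
values changed below):

    # Hypothetical helper; mirrors e.g.
    # storage-broker.zeta.eu-west-1.internal.aws.neon.build
    def storage_broker_host(cluster: str, region: str, zone: str) -> str:
        # a dedicated "storage-broker" subdomain under the internal zone
        return f"storage-broker.{cluster}.{region}.internal.aws.{zone}"

    assert (
        storage_broker_host("zeta", "eu-west-1", "neon.build")
        == "storage-broker.zeta.eu-west-1.internal.aws.neon.build"
    )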
--- .../helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml | 4 ++-- .../helm-values/dev-us-east-2-beta.neon-storage-broker.yaml | 4 ++-- .../prod-ap-southeast-1-epsilon.neon-storage-broker.yaml | 4 ++-- .../prod-eu-central-1-gamma.neon-storage-broker.yaml | 4 ++-- .../helm-values/prod-us-east-2-delta.neon-storage-broker.yaml | 4 ++-- .../helm-values/prod-us-west-2-eta.neon-storage-broker.yaml | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index 296785635c..d13cebead1 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -13,13 +13,13 @@ ingress: cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: - - host: storage-broker-zeta.eu-west-1.aws.neon.build + - host: storage-broker.zeta.eu-west-1.internal.aws.neon.build paths: - path: / pathType: Prefix tls: - hosts: - - storage-broker-zeta.eu-west-1.aws.neon.build + - storage-broker.zeta.eu-west-1.internal.aws.neon.build secretName: storage-broker-tls diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index f197d2e579..b8b8fb055c 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -13,13 +13,13 @@ ingress: cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: - - host: storage-broker-beta.us-east-2.aws.neon.build + - host: storage-broker.beta.us-east-2.internal.aws.neon.build paths: - path: / pathType: Prefix tls: - hosts: - - storage-broker-beta.us-east-2.aws.neon.build + - storage-broker.beta.us-east-2.internal.aws.neon.build secretName: storage-broker-tls diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml index 959033939a..bd979e0649 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml @@ -13,13 +13,13 @@ ingress: cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: - - host: storage-broker-epsilon.ap-southeast-1.aws.neon.tech + - host: storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech paths: - path: / pathType: Prefix tls: - hosts: - - storage-broker-epsilon.ap-southeast-1.aws.neon.tech + - storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech secretName: storage-broker-tls diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml index 1184ff442c..79cc751c65 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -13,13 +13,13 @@ ingress: cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: - - host: storage-broker-gamma.eu-central-1.aws.neon.tech + - host: storage-broker.gamma.eu-central-1.internal.aws.neon.tech paths: - path: / pathType: Prefix tls: - hosts: - - storage-broker-gamma.eu-central-1.aws.neon.tech + - storage-broker.gamma.eu-central-1.internal.aws.neon.tech secretName: storage-broker-tls diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml 
b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml index 651b87b96a..959abea20c 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -13,13 +13,13 @@ ingress: cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: - - host: storage-broker-delta.us-east-2.aws.neon.tech + - host: storage-broker.delta.us-east-2.internal.aws.neon.tech paths: - path: / pathType: Prefix tls: - hosts: - - storage-broker-delta.us-east-2.aws.neon.tech + - storage-broker.delta.us-east-2.internal.aws.neon.tech secretName: storage-broker-tls diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml index 1c7cbfd44e..f41f87b7b7 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml @@ -13,13 +13,13 @@ ingress: cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: - - host: storage-broker-eta.us-west-2.aws.neon.tech + - host: storage-broker.eta.us-west-2.internal.aws.neon.tech paths: - path: / pathType: Prefix tls: - hosts: - - storage-broker-eta.us-west-2.aws.neon.tech + - storage-broker.eta.us-west-2.internal.aws.neon.tech secretName: storage-broker-tls From 28667ce7247083c89368c232d2cdc532d8f63aa6 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 8 Dec 2022 14:21:30 +0400 Subject: [PATCH 040/167] Make safekeeper exit code 0. We don't have any useful graceful shutdown mode, so immediate one is normal. https://github.com/neondatabase/neon/issues/2956 --- safekeeper/src/bin/safekeeper.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index fcd3065c65..45f0f2f5b2 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -264,10 +264,10 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo signals.handle(|signal| { // TODO: implement graceful shutdown with joining threads etc info!( - "Got {}. Terminating in immediate shutdown mode", + "received {}, terminating in immediate shutdown mode", signal.name() ); - std::process::exit(111); + std::process::exit(0); }) } From 3321eea679b398b4b6833c469e4f4a8322423f20 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 9 Dec 2022 14:26:05 +0100 Subject: [PATCH 041/167] Fix for #3043 (#3048) --- pgxn/neon/pagestore_smgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 76f53aae0b..73bf330baf 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -343,7 +343,7 @@ compact_prefetch_buffers(void) if (MyPState->ring_last != empty_ring_index) { - MyPState->ring_last = empty_ring_index; + prefetch_cleanup_trailing_unused(); return true; } From 8684b1b582e181c95dd1158917704911d175a50c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Fri, 9 Dec 2022 16:37:53 +0200 Subject: [PATCH 042/167] Reduce the storage-broker deployment timeout to 5 minutes. 15 minutes is (#3047) 15 minutes is way too long, at least at this point and we want to see the possible errors quicker. Hence drop it to 5min to have some safety margin. 
--- .github/workflows/build_and_test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 49187ab64a..f568b7c4fd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -927,7 +927,7 @@ jobs: - name: Deploy storage-broker run: DOCKER_TAG=${{ needs.tag.outputs.build-tag }} - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 5m0s deploy-proxy-new: runs-on: [ self-hosted, dev, x64 ] @@ -1013,7 +1013,7 @@ jobs: - name: Deploy storage-broker run: DOCKER_TAG=${{ needs.tag.outputs.build-tag }} - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 5m0s deploy-proxy-prod-new: runs-on: prod @@ -1091,7 +1091,7 @@ jobs: - name: Deploy storage-broker run: DOCKER_TAG=${{ needs.tag.outputs.build-tag }} - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 5m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] From 4d6137e0e682d3224f1717d956e8d28f609c329b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 9 Dec 2022 21:06:15 +0400 Subject: [PATCH 043/167] Try to fix docker image tag in broker deploy. 
--- .github/workflows/build_and_test.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f568b7c4fd..ba70b786fd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -926,8 +926,7 @@ jobs: - name: Deploy storage-broker run: - DOCKER_TAG=${{ needs.tag.outputs.build-tag }} - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 5m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s deploy-proxy-new: runs-on: [ self-hosted, dev, x64 ] @@ -1012,8 +1011,7 @@ jobs: - name: Deploy storage-broker run: - DOCKER_TAG=${{ needs.tag.outputs.build-tag }} - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 5m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s deploy-proxy-prod-new: runs-on: prod @@ -1090,8 +1088,7 @@ jobs: - name: Deploy storage-broker run: - DOCKER_TAG=${{ needs.tag.outputs.build-tag }} - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 5m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] From 861dc8e64e5ea87555400c7904dcab78980d826e Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 9 Dec 2022 13:18:37 +0200 Subject: [PATCH 044/167] Remove redundant once_cell usages --- libs/postgres_ffi/wal_craft/src/lib.rs | 17 +++++++---------- pageserver/src/task_mgr.rs | 2 +- pageserver/src/tenant/block_io.rs | 3 +-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index c4404b37ba..feec3b2ace 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,7 +1,6 @@ use anyhow::*; use core::time::Duration; use log::*; -use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -26,15 +25,13 @@ pub struct PostgresServer { client_config: postgres::Config, } -pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { - vec![ - "wal_keep_size=50MB", // Ensure old WAL is not removed - "shared_preload_libraries=neon", // can only be loaded at startup - // Disable 
background processes as much as possible - "wal_writer_delay=10s", - "autovacuum=off", - ] -}); +pub static REQUIRED_POSTGRES_CONFIG: [&str; 4] = [ + "wal_keep_size=50MB", // Ensure old WAL is not removed + "shared_preload_libraries=neon", // can only be loaded at startup + // Disable background processes as much as possible + "wal_writer_delay=10s", + "autovacuum=off", +]; impl Conf { pub fn pg_distrib_dir(&self) -> anyhow::Result { diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 86d1266f09..3462f4eb82 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -139,7 +139,7 @@ pub struct PageserverTaskId(u64); /// Each task that we track is associated with a "task ID". It's just an /// increasing number that we assign. Note that it is different from tokio::task::Id. -static NEXT_TASK_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); +static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1); /// Global registry of tasks static TASKS: Lazy>>> = diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index bbcdabe1cd..ab36754c9e 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,7 +5,6 @@ use crate::page_cache; use crate::page_cache::{ReadBufResult, PAGE_SZ}; use bytes::Bytes; -use once_cell::sync::Lazy; use std::ops::{Deref, DerefMut}; use std::os::unix::fs::FileExt; use std::sync::atomic::AtomicU64; @@ -117,7 +116,7 @@ where } } -static NEXT_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); +static NEXT_ID: AtomicU64 = AtomicU64::new(1); /// An adapter for reading a (virtual) file using the page cache. /// From b8a5664fb9500c23c1b84c9bc002a4cc7acfb1f5 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 10 Dec 2022 00:35:05 +0200 Subject: [PATCH 045/167] test: kill spawned postgres (#3054) Fixes #2604. 
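The guard reads the postmaster PID from postmaster.pid, sends SIGTERM on exit,
and escalates to SIGKILL if the process does not stop. A simplified,
self-contained Python sketch of that logic (the real helper added below also
tolerates a pidfile that disappears while waiting and keeps the file handle
open for the duration of the test):

    import os
    import signal
    import time
    from pathlib import Path

    def kill_from_pidfile(pidfile: Path, timeout: float = 4.0) -> None:
        # the first line of postmaster.pid is the postmaster PID
        pid = int(pidfile.read_text().splitlines()[0])
        os.kill(pid, signal.SIGTERM)
        deadline = time.monotonic() + timeout
        # wait for the pidfile to go away, i.e. for postgres to shut down
        while pidfile.is_file() and time.monotonic() < deadline:
            time.sleep(0.2)
        if pidfile.is_file():
            os.kill(pid, signal.SIGKILL)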
--- test_runner/regress/test_compute_ctl.py | 71 ++++++++++++++++++++----- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index 1851aeed55..74ee2a89d4 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -1,4 +1,5 @@ import os +from pathlib import Path from subprocess import TimeoutExpired from fixtures.log_helper import log @@ -195,18 +196,60 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ctl_logs = exc.stderr.decode("utf-8") log.info("compute_ctl output:\n" + ctl_logs) - start = "starting safekeepers syncing" - end = "safekeepers synced at LSN" - start_pos = ctl_logs.index(start) - assert start_pos != -1 - end_pos = ctl_logs.index(end, start_pos) - assert end_pos != -1 - sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] - log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) + with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): + start = "starting safekeepers syncing" + end = "safekeepers synced at LSN" + start_pos = ctl_logs.index(start) + assert start_pos != -1 + end_pos = ctl_logs.index(end, start_pos) + assert end_pos != -1 + sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] + log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) - # assert that --sync-safekeepers logs are present in the output - assert "connecting with node" in sync_safekeepers_logs - assert "connected with node" in sync_safekeepers_logs - assert "proposer connected to quorum (2)" in sync_safekeepers_logs - assert "got votes from majority (2)" in sync_safekeepers_logs - assert "sending elected msg to node" in sync_safekeepers_logs + # assert that --sync-safekeepers logs are present in the output + assert "connecting with node" in sync_safekeepers_logs + assert "connected with node" in sync_safekeepers_logs + assert "proposer connected to quorum (2)" in sync_safekeepers_logs + assert "got votes from majority (2)" in sync_safekeepers_logs + assert "sending elected msg to node" in sync_safekeepers_logs + + +class ExternalProcessManager: + """ + Context manager that kills a process with a pid file on exit. + """ + + def __init__(self, pid_file: Path): + self.path = pid_file + self.pid_file = open(pid_file, "r") + self.pid = int(self.pid_file.readline().strip()) + + def __enter__(self): + return self + + def leave_alive(self): + self.pid_file.close() + + def __exit__(self, _type, _value, _traceback): + import signal + import time + + if self.pid_file.closed: + return + + with self.pid_file: + try: + os.kill(self.pid, signal.SIGTERM) + except os.OsError as e: + if not self.path.is_file(): + return + log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}") + return + + for _ in range(20): + if not self.path.is_file(): + return + time.sleep(0.2) + + log.info("Process failed to stop after SIGTERM: {self.pid}") + os.kill(self.pid, signal.SIGKILL) From 700a36ee6bc83b2a41c856f22e7e8afd1e6e2c21 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 10 Dec 2022 10:18:55 +0200 Subject: [PATCH 046/167] Wait for certain tenant status in the remote storage test (#3055) Closes https://github.com/neondatabase/neon/issues/3052 From what I could understand from the PR, we did not wait enough before the attach failed. Extended the wait period a bit and put a check for a status instead of plain `sleep` to fail if we don't get the expected status. 
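In other words: poll the tenant status until the expected state is reported or
a deadline passes, instead of sleeping for a fixed amount of time. A simplified
Python sketch of the polling loop (get_state() stands in for the pageserver
HTTP client call used by the real helper added below):

    import time
    from typing import Callable

    def wait_for_state(get_state: Callable[[], str], expected: str, iterations: int) -> None:
        for _ in range(iterations):
            try:
                if get_state() == expected:
                    return
            except Exception:
                pass  # the endpoint may not answer yet; just retry
            time.sleep(1)
        raise Exception(f"state did not become {expected} in {iterations} seconds")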
--- test_runner/fixtures/neon_fixtures.py | 21 +++++++++++++++ test_runner/regress/test_remote_storage.py | 3 ++- test_runner/regress/test_tenant_detach.py | 30 ++++------------------ 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d20e591e9b..5fbde5e03b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2938,6 +2938,27 @@ def wait_for_upload( ) +# Does not use `wait_until` for debugging purposes +def wait_until_tenant_state( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + expected_state: str, + iterations: int, +) -> bool: + for _ in range(iterations): + try: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + log.debug(f"Tenant {tenant_id} data: {tenant}") + if tenant["state"] == expected_state: + return True + except Exception as e: + log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") + + time.sleep(1) + + raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") + + def last_record_lsn( pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 506955b1df..7152bc8b6a 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -19,6 +19,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, wait_for_last_record_lsn, wait_for_upload, + wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import print_gc_result, query_scalar, wait_until @@ -134,7 +135,7 @@ def test_remote_storage_backup_and_restore( client.tenant_attach(tenant_id) # is there a better way to assert that failpoint triggered? 
- time.sleep(10) + wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) # assert cannot attach timeline that is scheduled for download # FIXME implement layer download retries diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0d3465cc01..59811c565c 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -12,6 +12,7 @@ from fixtures.neon_fixtures import ( available_remote_storages, wait_for_last_record_lsn, wait_for_upload, + wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar @@ -230,7 +231,7 @@ def test_ignored_tenant_reattach( # now, load it from the local files and expect it works pageserver_http.tenant_load(tenant_id=ignored_tenant_id) - wait_until_tenant_status(pageserver_http, ignored_tenant_id, "Active", 5) + wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5) tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] tenants_after_attach.sort() @@ -289,7 +290,7 @@ def test_ignored_tenant_download_missing_layers( # now, load it from the local files and expect it to work due to remote storage restoration pageserver_http.tenant_load(tenant_id=tenant_id) - wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5) + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] tenants_after_attach.sort() @@ -340,7 +341,7 @@ def test_ignored_tenant_stays_broken_without_metadata( # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory pageserver_http.tenant_load(tenant_id=tenant_id) - wait_until_tenant_status(pageserver_http, tenant_id, "Broken", 5) + wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 5) # Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally @@ -441,7 +442,7 @@ def test_ignore_while_attaching( # But can load it from local files, that will restore attach. 
pageserver_http.tenant_load(tenant_id) - wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5) + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) pg.stop() pg.start() @@ -481,24 +482,3 @@ def ensure_test_data(data_id: int, data: str, pg: Postgres): assert ( query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data ), "Should have timeline data back" - - -# Does not use `wait_until` for debugging purposes -def wait_until_tenant_status( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - expected_status: str, - iterations: int, -) -> bool: - for _ in range(iterations): - try: - tenant = pageserver_http.tenant_status(tenant_id=tenant_id) - log.debug(f"Tenant {tenant_id} status: {tenant}") - if tenant["state"] == expected_status: - return True - except Exception as e: - log.debug(f"Tenant {tenant_id} status retrieval failure: {e}") - - time.sleep(1) - - raise Exception(f"Tenant {tenant_id} did not become {expected_status} in {iterations} seconds") From 0f445827f5c070f8adedcd5f9d56c92d0355eda2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 10 Dec 2022 23:35:05 +0000 Subject: [PATCH 047/167] test_seqscans: increase table size for remote test (#3057) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase table size four times to fix the following error: ``` ______________________ test_seqscans[remote-100000-100-0] ______________________ test_runner/performance/test_seqscans.py:57: in test_seqscans assert int(shared_buffers) < int(table_size) E assert 536870912 < 181239808 E + where 536870912 = int(536870912) E + and 181239808 = int(181239808) ``` 536870912 / 181239808 ≈ 2.96 --- test_runner/performance/test_seqscans.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index a0a1dbd01d..a61d64553d 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -24,12 +24,12 @@ from pytest_lazyfixture import lazy_fixture # type: ignore @pytest.mark.parametrize( "env, scale", [ - # Run on all envs. Use 50x larger table on remote cluster to make sure + # Run on all envs. Use 200x larger table on remote cluster to make sure # it doesn't fit in shared buffers, which are larger on remote than local. pytest.param(lazy_fixture("neon_compare"), 1, id="neon"), pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"), pytest.param( - lazy_fixture("remote_compare"), 50, id="remote", marks=pytest.mark.remote_cluster + lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster ), ], ) From 249d77c720e34a222a1e48516c036d146ace3b3b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 12 Dec 2022 12:57:41 +0400 Subject: [PATCH 048/167] Deploy broker with L4 LB on old envs. To avoid having to configure MAX_CONCURRENT_STREAMS on L7 LB (as well as TLS & public DNS). 
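Illustratively, the difference this makes for a gRPC client is only in how the
endpoint is dialed; the Python sketch below is not part of the deployment (the
real consumers are the pageserver and safekeeper binaries) and assumes the
grpcio package:

    import grpc

    # L4 NLB (old envs): plaintext gRPC straight to the broker port in the VPC
    l4_channel = grpc.insecure_channel("storage-broker.staging.local:50051")

    # L7 ALB (new regional envs): TLS terminated at the load balancer on 443
    l7_channel = grpc.secure_channel(
        "storage-broker.beta.us-east-2.internal.aws.neon.build:443",
        grpc.ssl_channel_credentials(),
    )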
--- .../neon-stress.neon-storage-broker.yaml | 28 +++++++++---------- .../production.neon-storage-broker.yaml | 28 +++++++++---------- .../staging.neon-storage-broker.yaml | 28 +++++++++---------- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/.github/helm-values/neon-stress.neon-storage-broker.yaml b/.github/helm-values/neon-stress.neon-storage-broker.yaml index b141246df0..fd35c5e14e 100644 --- a/.github/helm-values/neon-stress.neon-storage-broker.yaml +++ b/.github/helm-values/neon-stress.neon-storage-broker.yaml @@ -3,22 +3,22 @@ podLabels: neon_env: neon-stress neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: alb - alb.ingress.kubernetes.io/healthcheck-path: /status - alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' - alb.ingress.kubernetes.io/scheme: "internal" - alb.ingress.kubernetes.io/target-type: "ip" - alb.ingress.kubernetes.io/ssl-redirect: "443" - alb.ingress.kubernetes.io/backend-protocol-version: "GRPC" + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker.neon-stress.local + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 - hosts: - - host: storage-broker-stress.stage.neon.tech - paths: - - path: / - pathType: Prefix +ingress: + enabled: false metrics: enabled: true diff --git a/.github/helm-values/production.neon-storage-broker.yaml b/.github/helm-values/production.neon-storage-broker.yaml index 299d6fa89e..395b023671 100644 --- a/.github/helm-values/production.neon-storage-broker.yaml +++ b/.github/helm-values/production.neon-storage-broker.yaml @@ -3,22 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: alb - alb.ingress.kubernetes.io/healthcheck-path: /status - alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' - alb.ingress.kubernetes.io/scheme: "internal" - alb.ingress.kubernetes.io/target-type: "ip" - alb.ingress.kubernetes.io/ssl-redirect: "443" - alb.ingress.kubernetes.io/backend-protocol-version: "GRPC" + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 - hosts: - - host: storage-broker.neon.tech - paths: - - path: / - pathType: Prefix +ingress: + enabled: false metrics: enabled: true diff --git a/.github/helm-values/staging.neon-storage-broker.yaml b/.github/helm-values/staging.neon-storage-broker.yaml index 54e1e1bba2..bffcf41ef0 100644 --- a/.github/helm-values/staging.neon-storage-broker.yaml +++ b/.github/helm-values/staging.neon-storage-broker.yaml @@ -3,22 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker 
-ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: alb - alb.ingress.kubernetes.io/healthcheck-path: /status - alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' - alb.ingress.kubernetes.io/scheme: "internal" - alb.ingress.kubernetes.io/target-type: "ip" - alb.ingress.kubernetes.io/ssl-redirect: "443" - alb.ingress.kubernetes.io/backend-protocol-version: "GRPC" + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker.staging.local + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 - hosts: - - host: storage-broker.stage.neon.tech - paths: - - path: / - pathType: Prefix +ingress: + enabled: false metrics: enabled: true From 32662ff1c42a1f36001eddc9fdf60a087fc92e71 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 16 Sep 2022 13:44:28 +0300 Subject: [PATCH 049/167] Replace etcd with storage_broker. This is the replacement itself, the binary landed earlier. See docs/storage_broker.md. ref https://github.com/neondatabase/neon/pull/2466 https://github.com/neondatabase/neon/issues/2394 --- .github/ansible/neon-stress.hosts.yaml | 2 +- .../ansible/prod.ap-southeast-1.hosts.yaml | 2 +- .github/ansible/prod.eu-central-1.hosts.yaml | 2 +- .github/ansible/prod.us-east-2.hosts.yaml | 2 +- .github/ansible/production.hosts.yaml | 2 +- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- .github/ansible/systemd/pageserver.service | 2 +- .github/ansible/systemd/safekeeper.service | 2 +- .github/workflows/build_and_test.yml | 3 +- Cargo.lock | 901 ++++++++---------- Dockerfile | 2 +- README.md | 14 +- control_plane/Cargo.toml | 2 + control_plane/simple.conf | 4 +- control_plane/src/bin/neon_local.rs | 31 +- control_plane/src/broker.rs | 48 + control_plane/src/etcd.rs | 78 -- control_plane/src/lib.rs | 2 +- control_plane/src/local_env.rs | 85 +- control_plane/src/pageserver.rs | 63 +- control_plane/src/safekeeper.rs | 9 +- docker-compose/docker-compose.yml | 56 +- docs/authentication.md | 4 +- docs/docker.md | 4 +- docs/settings.md | 18 +- libs/etcd_broker/Cargo.toml | 18 - libs/etcd_broker/src/lib.rs | 209 ---- libs/etcd_broker/src/subscription_key.rs | 310 ------ libs/etcd_broker/src/subscription_value.rs | 38 - libs/safekeeper_api/src/models.rs | 37 + pageserver/Cargo.toml | 2 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 96 +- pageserver/src/task_mgr.rs | 4 +- pageserver/src/tenant/timeline.rs | 9 +- pageserver/src/walreceiver.rs | 50 +- .../src/walreceiver/connection_manager.rs | 534 +++-------- safekeeper/Cargo.toml | 3 +- safekeeper/src/bin/safekeeper.rs | 43 +- safekeeper/src/broker.rs | 201 ++-- safekeeper/src/http/routes.rs | 23 +- safekeeper/src/lib.rs | 10 +- safekeeper/src/safekeeper.rs | 53 +- safekeeper/src/timeline.rs | 60 +- storage_broker/Cargo.toml | 4 +- storage_broker/benches/rps.rs | 16 +- storage_broker/src/lib.rs | 28 +- test_runner/README.md | 2 - test_runner/fixtures/neon_fixtures.py | 86 +- test_runner/fixtures/utils.py | 8 - test_runner/regress/test_compatibility.py | 71 +- 
test_runner/regress/test_tenant_relocation.py | 6 +- test_runner/regress/test_wal_acceptor.py | 12 +- workspace_hack/Cargo.toml | 7 +- 56 files changed, 1064 insertions(+), 2222 deletions(-) create mode 100644 control_plane/src/broker.rs delete mode 100644 control_plane/src/etcd.rs delete mode 100644 libs/etcd_broker/Cargo.toml delete mode 100644 libs/etcd_broker/src/lib.rs delete mode 100644 libs/etcd_broker/src/subscription_key.rs delete mode 100644 libs/etcd_broker/src/subscription_value.rs diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml index dd61ac5a5e..6b2166e7a6 100644 --- a/.github/ansible/neon-stress.hosts.yaml +++ b/.github/ansible/neon-stress.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-storage-ireland bucket_region: eu-west-1 console_mgmt_base_url: http://neon-stress-console.local - etcd_endpoints: neon-stress-etcd.local:2379 + broker_endpoint: http://storage-broker.neon-stress.local:50051 safekeeper_enable_s3_offload: 'false' pageserver_config_stub: pg_distrib_dir: /usr/local diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index bb4af91f71..76ec3d29ae 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 console_mgmt_base_url: http://console-release.local - etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379 + broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 68b1579746..c8a8b15ddb 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 console_mgmt_base_url: http://console-release.local - etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379 + broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 1d54e2ef0a..36a5337a8d 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-release.local - etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379 + broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index 63b293f3e3..cea0556ba1 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -4,7 +4,7 @@ storage: console_mgmt_base_url: http://console-release.local bucket_name: zenith-storage-oregon bucket_region: us-west-2 - etcd_endpoints: zenith-1-etcd.local:2379 + broker_endpoint: http://storage-broker.prod.local:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 088ba03e5e..4a64423a0d 100644 --- 
a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-dev-storage-eu-west-1 bucket_region: eu-west-1 console_mgmt_base_url: http://console-staging.local - etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379 + broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.hosts.yaml b/.github/ansible/staging.hosts.yaml index ae55f9223c..a580b7563a 100644 --- a/.github/ansible/staging.hosts.yaml +++ b/.github/ansible/staging.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: zenith-staging-storage-us-east-1 bucket_region: us-east-1 console_mgmt_base_url: http://console-staging.local - etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379 + broker_endpoint: http://storage-broker.staging.local:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 26a82f8db4..5a5a673a5e 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-staging.local - etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379 + broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service index 653e2dc142..9847ee0f9e 100644 --- a/.github/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=pageserver Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} -ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data +ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index 7eaed100d8..828655e435 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -6,7 +6,7 @@ After=network.target auditd.service Type=simple User=safekeeper Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ 
broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed KillSignal=SIGINT diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ba70b786fd..30ceac1af1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -888,7 +888,8 @@ jobs: helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - deploy-storage-broker-staging: + deploy-storage-broker: + name: deploy storage broker on old staging and old prod runs-on: [ self-hosted, dev, x64 ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. diff --git a/Cargo.lock b/Cargo.lock index 12ab6f17aa..913b39da0f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,9 +30,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.19" +version = "0.7.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" dependencies = [ "memchr", ] @@ -59,9 +59,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anyhow" -version = "1.0.65" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98161a4e3e2184da77bb14f02184cdd111e83bbbcc9979dfee3c44b9a85f5602" +checksum = "216261ddc8289130e551ddcd5ce8a064710c0d064a4d2895c67151c92b5443f6" dependencies = [ "backtrace", ] @@ -134,9 +134,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.57" +version = "0.1.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f" +checksum = "31e6e93155431f3931513b243d371981bb2770112b370c82745a1d19d2f99364" dependencies = [ "proc-macro2", "quote", @@ -145,9 +145,9 @@ dependencies = [ [[package]] name = "atomic-polyfill" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89" +checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28" dependencies = [ "critical-section", ] @@ -158,7 +158,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi", ] @@ -491,16 +491,16 @@ dependencies = [ "aws-smithy-http", "aws-smithy-types", "http", - "rustc_version 0.4.0", + "rustc_version", "tracing", "zeroize", ] [[package]] name = "axum" -version = "0.5.16" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c9e3356844c4d6a6d6467b8da2cffb4a2820be256f50a3a386c9d152bab31043" +checksum = "08b108ad2665fa3f6e6a517c3d80ec3e77d224c47d605167aefaa5d7ef97fa48" dependencies = [ "async-trait", "axum-core", @@ -516,9 +516,9 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", + "rustversion", "serde", "sync_wrapper", - "tokio", "tower", "tower-http", "tower-layer", @@ -527,9 +527,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.2.8" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b" +checksum = "79b8558f5a0581152dc94dcd289132a1d377494bdeafcd41869b3258e3e2ad92" dependencies = [ "async-trait", "bytes", @@ -537,6 +537,7 @@ dependencies = [ "http", "http-body", "mime", + "rustversion", "tower-layer", "tower-service", ] @@ -556,26 +557,11 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "bare-metal" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3" -dependencies = [ - "rustc_version 0.2.3", -] - -[[package]] -name = "bare-metal" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603" - [[package]] name = "base64" -version = "0.13.0" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" [[package]] name = "bincode" @@ -608,18 +594,6 @@ dependencies = [ "which", ] -[[package]] -name = "bit_field" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" - -[[package]] -name = "bitfield" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" - [[package]] name = "bitflags" version = "1.3.2" @@ -649,15 +623,15 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.11.0" +version = "3.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" +checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" [[package]] name = "bytemuck" -version = "1.12.1" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" +checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f" [[package]] name = "byteorder" @@ -667,9 +641,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.2.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" +checksum = "dfb24e866b15a1af2a1b663f10c6b6b8f397a84aadb828f12e5b289ec23a3a3c" dependencies = [ "serde", ] @@ -692,9 +666,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.73" +version = "1.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" 
+checksum = "e9f73505338f7d905b19d18738976aae232eb46b8efc15554ffc56deb5d9ebe4" [[package]] name = "cexpr" @@ -770,9 +744,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.22" +version = "3.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" +checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" dependencies = [ "bitflags", "clap_lex 0.2.4", @@ -782,14 +756,14 @@ dependencies = [ [[package]] name = "clap" -version = "4.0.15" +version = "4.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bf8832993da70a4c6d13c581f4463c2bdda27b9bf1c5498dc4365543abe6d6f" +checksum = "4d63b9e9c07271b9957ad22c173bae2a4d9a81127680962039296abcd2f8251d" dependencies = [ - "atty", "bitflags", "clap_derive", "clap_lex 0.3.0", + "is-terminal", "once_cell", "strsim", "termcolor", @@ -797,9 +771,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.0.13" +version = "4.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42f169caba89a7d512b5418b09864543eeb4d497416c917d7137863bd2076ad" +checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" dependencies = [ "heck", "proc-macro-error", @@ -836,15 +810,6 @@ dependencies = [ "libc", ] -[[package]] -name = "cmake" -version = "0.1.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a" -dependencies = [ - "cc", -] - [[package]] name = "codespan-reporting" version = "0.11.1" @@ -867,9 +832,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "6.1.0" +version = "6.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85914173c2f558d61613bfbbf1911f14e630895087a7ed2fafc0f5319e1536e7" +checksum = "e621e7e86c46fd8a14c32c6ae3cb95656621b4743a27d0cffedb831d46e7ad21" dependencies = [ "crossterm", "strum", @@ -883,7 +848,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.0.15", + "clap 4.0.29", "env_logger", "futures", "hyper", @@ -925,10 +890,10 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.15", + "clap 4.0.29", "comfy-table", "git-version", - "nix 0.25.0", + "nix 0.25.1", "once_cell", "pageserver_api", "postgres", @@ -938,6 +903,7 @@ dependencies = [ "safekeeper_api", "serde", "serde_with", + "storage_broker", "tar", "thiserror", "toml", @@ -962,18 +928,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" -[[package]] -name = "cortex-m" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0" -dependencies = [ - "bare-metal 0.2.5", - "bitfield", - "embedded-hal", - "volatile-register", -] - [[package]] name = "cpp_demangle" version = "0.3.5" @@ -998,7 +952,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version 0.4.0", + "rustc_version", ] [[package]] @@ -1020,7 +974,7 @@ dependencies = [ "atty", "cast", "ciborium", - "clap 3.2.22", + "clap 3.2.23", "criterion-plot", "itertools", "lazy_static", @@ -1048,15 +1002,9 @@ dependencies = [ [[package]] name = "critical-section" -version = "0.2.7" +version = 
"1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd" -dependencies = [ - "bare-metal 1.0.0", - "cfg-if", - "cortex-m", - "riscv", -] +checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" [[package]] name = "crossbeam-channel" @@ -1081,14 +1029,14 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.11" +version = "0.9.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" +checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.6.5", + "memoffset 0.7.1", "scopeguard", ] @@ -1139,9 +1087,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f83d0ebf42c6eafb8d7c52f7e5f2d3003b89c7aa4fd2b79229209459a849af8" +checksum = "bdf07d07d6531bfcdbe9b8b739b104610c6508dcc4d63b410585faf338241daf" dependencies = [ "cc", "cxxbridge-flags", @@ -1151,9 +1099,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07d050484b55975889284352b0ffc2ecbda25c0c55978017c132b29ba0818a86" +checksum = "d2eb5b96ecdc99f72657332953d4d9c50135af1bac34277801cc3937906ebd39" dependencies = [ "cc", "codespan-reporting", @@ -1166,15 +1114,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d2199b00553eda8012dfec8d3b1c75fce747cf27c169a270b3b99e3448ab78" +checksum = "ac040a39517fd1674e0f32177648334b0f4074625b5588a64519804ba0553b12" [[package]] name = "cxxbridge-macro" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb67a6de1f602736dd7eaead0080cf3435df806c61b24b13328db128c58868f" +checksum = "1362b0ddcfc4eb0a1f57b68bd77dd99f0e826958a96abd0ae9bd092e114ffed6" dependencies = [ "proc-macro2", "quote", @@ -1183,9 +1131,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4529658bdda7fd6769b8614be250cdcfc3aeb0ee72fe66f9e41e5e5eb73eac02" +checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" dependencies = [ "darling_core", "darling_macro", @@ -1193,9 +1141,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "649c91bc01e8b1eac09fb91e8dbc7d517684ca6be8ebc75bb9cafc894f9fdb6f" +checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" dependencies = [ "fnv", "ident_case", @@ -1207,9 +1155,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc69c5bfcbd2fc09a0f38451d2daf0e372e367986a83906d1b0dbc88134fb5" +checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" dependencies = [ "darling_core", "quote", @@ -1218,9 +1166,9 @@ dependencies = [ [[package]] name = "data-encoding" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57" +checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb" [[package]] name = "debugid" @@ -1238,7 +1186,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" dependencies = [ "serde", - "uuid 1.2.1", + "uuid 1.2.2", ] [[package]] @@ -1257,9 +1205,9 @@ dependencies = [ [[package]] name = "digest" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c" +checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" dependencies = [ "block-buffer", "crypto-common", @@ -1283,16 +1231,6 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" -[[package]] -name = "embedded-hal" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff" -dependencies = [ - "nb 0.1.3", - "void", -] - [[package]] name = "encoding_rs" version = "0.8.31" @@ -1304,9 +1242,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.9.1" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c90bf5f19754d10198ccb95b70664fc925bd1fc090a0fd9a6ebc54acc8cd6272" +checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" dependencies = [ "atty", "humantime", @@ -1316,36 +1254,24 @@ dependencies = [ ] [[package]] -name = "etcd-client" -version = "0.9.2" +name = "errno" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8664f6ea68aba5503d42dd1be786b0f1bd9b7972e7f40208c83ef74db91bf" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" dependencies = [ - "http", - "prost 0.10.4", - "tokio", - "tokio-stream", - "tonic 0.7.2", - "tonic-build 0.7.2", - "tower", - "tower-service", + "errno-dragonfly", + "libc", + "winapi", ] [[package]] -name = "etcd_broker" -version = "0.1.0" +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" dependencies = [ - "etcd-client", - "once_cell", - "regex", - "serde", - "serde_json", - "serde_with", - "thiserror", - "tokio", - "tracing", - "utils", - "workspace_hack", + "cc", + "libc", ] [[package]] @@ -1376,14 +1302,14 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94a7bbaa59354bc20dd75b67f23e2797b4490e9d6928203fb105c79e448c86c" +checksum = "4b9663d381d07ae25dc88dbdf27df458faa83a9b25336bcac83d5e452b5fc9d3" dependencies = [ "cfg-if", "libc", "redox_syscall", - "windows-sys", + "windows-sys 0.42.0", ] [[package]] @@ -1455,9 +1381,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" +checksum = "38390104763dc37a5145a53c29c63c1290b5d316d6086ec32c293f6736051bb0" dependencies = [ "futures-channel", "futures-core", @@ -1470,9 +1396,9 @@ dependencies = [ [[package]] name = "futures-channel" 
-version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" +checksum = "52ba265a92256105f45b719605a571ffe2d1f0fea3807304b522c1d778f79eed" dependencies = [ "futures-core", "futures-sink", @@ -1480,15 +1406,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" +checksum = "04909a7a7e4633ae6c4a9ab280aeb86da1236243a77b694a49eacd659a4bd3ac" [[package]] name = "futures-executor" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" +checksum = "7acc85df6714c176ab5edf386123fafe217be88c0840ec11f199441134a074e2" dependencies = [ "futures-core", "futures-task", @@ -1497,15 +1423,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" +checksum = "00f5fb52a06bdcadeb54e8d3671f8888a39697dcb0b81b23b55174030427f4eb" [[package]] name = "futures-macro" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" +checksum = "bdfb8ce053d86b91919aad980c220b1fb8401a9394410e1c289ed7e66b61835d" dependencies = [ "proc-macro2", "quote", @@ -1514,15 +1440,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" +checksum = "39c15cf1a4aa79df40f1bb462fb39676d0ad9e366c2a33b590d7c66f4f81fcf9" [[package]] name = "futures-task" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" +checksum = "2ffb393ac5d9a6eaa9d3fdf37ae2776656b706e200c8e16b1bdb227f5198e6ea" [[package]] name = "futures-timer" @@ -1532,9 +1458,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" +checksum = "197676987abd2f9cadff84926f410af1c183608d36641465df73ae8211dc65d6" dependencies = [ "futures-channel", "futures-core", @@ -1560,9 +1486,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" dependencies = [ "cfg-if", "libc", @@ -1605,9 +1531,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" [[package]] name = "h2" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca32592cf21ac7ccab1825cd87f6c9b3d9022c44d086172ed0966bec8af30be" +checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" 
dependencies = [ "bytes", "fnv", @@ -1654,7 +1580,7 @@ checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" dependencies = [ "atomic-polyfill", "hash32", - "rustc_version 0.4.0", + "rustc_version", "spin 0.9.4", "stable_deref_trait", ] @@ -1674,6 +1600,15 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + [[package]] name = "hex" version = "0.4.3" @@ -1767,9 +1702,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.20" +version = "0.14.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac" +checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" dependencies = [ "bytes", "futures-channel", @@ -1791,9 +1726,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.23.0" +version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac" +checksum = "59df7c4e19c950e6e0e868dcc0a300b09a9b88e9ec55bd879ca819087a77355d" dependencies = [ "http", "hyper", @@ -1831,9 +1766,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.51" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5a6ef98976b22b3b7f2f3a806f858cb862044cfa66805aa3ad84cb3d3b785ed" +checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1871,9 +1806,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.1" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" dependencies = [ "autocfg", "hashbrown", @@ -1928,10 +1863,32 @@ dependencies = [ ] [[package]] -name = "ipnet" -version = "2.5.0" +name = "io-lifetimes" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" +checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" +dependencies = [ + "libc", + "windows-sys 0.42.0", +] + +[[package]] +name = "ipnet" +version = "2.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f88c5561171189e69df9d98bcf18fd5f9558300f7ea7b801eb8a0fd748bd8745" + +[[package]] +name = "is-terminal" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "927609f78c2913a6f6ac3c27a4fe87f43e2a35367c0c4b0f8265e8f49a104330" +dependencies = [ + "hermit-abi 0.2.6", + "io-lifetimes", + "rustix", + "windows-sys 0.42.0", +] [[package]] name = "itertools" @@ -1959,9 +1916,9 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.1.1" +version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aa4b4af834c6cfd35d8763d359661b90f2e45d8f750a0849156c7f4671af09c" +checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" dependencies = [ "base64", "pem", @@ -2005,15 +1962,15 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = 
"0.2.135" +version = "0.2.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" +checksum = "db6d7e329c562c5dfab7a46a2afabc8b987ab9a4834c9d1ca04dc54c1546cef8" [[package]] name = "libloading" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" dependencies = [ "cfg-if", "winapi", @@ -2021,9 +1978,9 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" +checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" [[package]] name = "link-cplusplus" @@ -2034,6 +1991,12 @@ dependencies = [ "cc", ] +[[package]] +name = "linux-raw-sys" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f9f08d8963a6c613f4b1a78f4f4a4dbfadf8e6545b2d72861731e4858b8b47f" + [[package]] name = "lock_api" version = "0.4.9" @@ -2071,9 +2034,9 @@ dependencies = [ [[package]] name = "matchit" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" +checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" [[package]] name = "md-5" @@ -2098,9 +2061,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memmap2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95af15f345b17af2efc8ead6080fb8bc376f8cec1b35277b935637595fe77498" +checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc" dependencies = [ "libc", ] @@ -2156,14 +2119,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf" +checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ "libc", "log", "wasi", - "windows-sys", + "windows-sys 0.42.0", ] [[package]] @@ -2190,26 +2153,11 @@ dependencies = [ "tempfile", ] -[[package]] -name = "nb" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f" -dependencies = [ - "nb 1.0.0", -] - -[[package]] -name = "nb" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae" - [[package]] name = "nix" -version = "0.23.1" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c" dependencies = [ "bitflags", "cc", @@ -2220,9 +2168,9 @@ dependencies = [ [[package]] name = "nix" -version = "0.25.0" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" +checksum = 
"f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4" dependencies = [ "autocfg", "bitflags", @@ -2283,9 +2231,9 @@ dependencies = [ [[package]] name = "num-format" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54b862ff8df690cf089058c98b183676a7ed0f974cc08b426800093227cbff3b" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" dependencies = [ "arrayvec", "itoa", @@ -2313,20 +2261,11 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.13.1" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +checksum = "f6058e64324c71e02bc2b150e4f3bc8286db6c83092132ffa3f6b1eab0f9def5" dependencies = [ + "hermit-abi 0.1.19", "libc", ] @@ -2350,9 +2289,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" +checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" [[package]] name = "oorandom" @@ -2362,9 +2301,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "openssl" -version = "0.10.43" +version = "0.10.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "020433887e44c27ff16365eaa2d380547a94544ad509aff6eb5b6e3e0b27b376" +checksum = "29d971fd5722fec23977260f6e81aa67d2f22cadbdc2aa049f1022d9a3be1566" dependencies = [ "bitflags", "cfg-if", @@ -2394,9 +2333,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.78" +version = "0.9.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07d5c8cb6e57b3a3612064d7b18b117912b4ce70955c2504d4b741c9e244b132" +checksum = "5454462c0eced1e97f2ec09036abc8da362e66802f66fd20f86854d9d8cbcbc4" dependencies = [ "autocfg", "cc", @@ -2418,9 +2357,9 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.3.0" +version = "6.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" +checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" [[package]] name = "overload" @@ -2439,13 +2378,12 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.0.15", + "clap 4.0.29", "close_fds", "const_format", "crc32c", "criterion", "crossbeam-utils", - "etcd_broker", "fail", "futures", "git-version", @@ -2456,7 +2394,7 @@ dependencies = [ "hyper", "itertools", "metrics", - "nix 0.25.0", + "nix 0.25.1", "num-traits", "once_cell", "pageserver_api", @@ -2477,6 +2415,7 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "storage_broker", "svg_fmt", "tar", "tempfile", @@ -2526,7 +2465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.3", + "parking_lot_core 0.9.5", ] [[package]] @@ -2545,15 +2484,15 @@ dependencies = [ [[package]] name = "parking_lot_core" 
-version = "0.9.3" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" +checksum = "7ff9f3fef3968a3ec5945535ed654cb38ff72d7495a25619e2247fb15a2ed9ba" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-sys", + "windows-sys 0.42.0", ] [[package]] @@ -2761,7 +2700,7 @@ dependencies = [ "lazy_static", "libc", "log", - "nix 0.23.1", + "nix 0.23.2", "parking_lot 0.11.2", "symbolic-demangle", "tempfile", @@ -2770,9 +2709,9 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "pq_proto" @@ -2840,22 +2779,22 @@ dependencies = [ [[package]] name = "procfs" -version = "0.12.0" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0941606b9934e2d98a3677759a971756eb821f75764d0e0d26946d08e74d9104" +checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" dependencies = [ "bitflags", "byteorder", "hex", "lazy_static", - "libc", + "rustix", ] [[package]] name = "prometheus" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45c8babc29389186697fe5a2a4859d697825496b83db5d0b65271cdc0488e88c" +checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" dependencies = [ "cfg-if", "fnv", @@ -2869,51 +2808,19 @@ dependencies = [ [[package]] name = "prost" -version = "0.10.4" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71adf41db68aa0daaefc69bb30bcd68ded9b9abaad5d1fbb6304c4fb390e083e" +checksum = "c0b18e655c21ff5ac2084a5ad0611e827b3f92badf79f4910b5a5c58f4d87ff0" dependencies = [ "bytes", - "prost-derive 0.10.1", -] - -[[package]] -name = "prost" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0841812012b2d4a6145fae9a6af1534873c32aa67fff26bd09f8fa42c83f95a" -dependencies = [ - "bytes", - "prost-derive 0.11.2", + "prost-derive", ] [[package]] name = "prost-build" -version = "0.10.4" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae5a4388762d5815a9fc0dea33c56b021cdc8dde0c55e0c9ca57197254b0cab" -dependencies = [ - "bytes", - "cfg-if", - "cmake", - "heck", - "itertools", - "lazy_static", - "log", - "multimap", - "petgraph", - "prost 0.10.4", - "prost-types 0.10.1", - "regex", - "tempfile", - "which", -] - -[[package]] -name = "prost-build" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8b442418ea0822409d9e7d047cbf1e7e9e1760b172bf9982cf29d517c93511" +checksum = "e330bf1316db56b12c2bcfa399e8edddd4821965ea25ddb2c134b610b1c1c604" dependencies = [ "bytes", "heck", @@ -2923,27 +2830,14 @@ dependencies = [ "multimap", "petgraph", "prettyplease", - "prost 0.11.2", - "prost-types 0.11.2", + "prost", + "prost-types", "regex", "syn", "tempfile", "which", ] -[[package]] -name = "prost-derive" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b670f45da57fb8542ebdbb6105a925fe571b67f9e7ed9f47a06a84e72b4e7cc" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = 
"prost-derive" version = "0.11.2" @@ -2957,16 +2851,6 @@ dependencies = [ "syn", ] -[[package]] -name = "prost-types" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68" -dependencies = [ - "bytes", - "prost 0.10.4", -] - [[package]] name = "prost-types" version = "0.11.2" @@ -2974,7 +2858,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a" dependencies = [ "bytes", - "prost 0.11.2", + "prost", ] [[package]] @@ -2987,7 +2871,7 @@ dependencies = [ "base64", "bstr", "bytes", - "clap 4.0.15", + "clap 4.0.29", "futures", "git-version", "hashbrown", @@ -3022,7 +2906,7 @@ dependencies = [ "tracing-subscriber", "url", "utils", - "uuid 1.2.1", + "uuid 1.2.2", "workspace_hack", "x509-parser", ] @@ -3087,11 +2971,10 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.3" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" +checksum = "1e060280438193c554f654141c9ea9417886713b7acd75974c85b18a69a88e0b" dependencies = [ - "autocfg", "crossbeam-deque", "either", "rayon-core", @@ -3099,9 +2982,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.3" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +checksum = "cac410af5d00ab6884528b4ab69d1e8e146e8d471201800fa1b4524126de6ad3" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ -3132,9 +3015,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" +checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a" dependencies = [ "aho-corasick", "memchr", @@ -3152,9 +3035,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.27" +version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" [[package]] name = "remote_storage" @@ -3191,9 +3074,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.12" +version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "431949c384f4e2ae07605ccaa56d1d9d2ecdb5cadd4f9577ccfab29f2e5149fc" +checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c" dependencies = [ "base64", "bytes", @@ -3255,27 +3138,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "riscv" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba" -dependencies = [ - "bare-metal 1.0.0", - "bit_field", - "riscv-target", -] - -[[package]] -name = "riscv-target" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222" -dependencies = [ - "lazy_static", - "regex", -] - [[package]] name = "routerify" version = "3.0.0" @@ -3309,7 +3171,7 @@ dependencies = [ "futures", "futures-timer", "rstest_macros", - "rustc_version 0.4.0", + 
"rustc_version", ] [[package]] @@ -3321,7 +3183,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "rustc_version 0.4.0", + "rustc_version", "syn", ] @@ -3337,22 +3199,13 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" -[[package]] -name = "rustc_version" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" -dependencies = [ - "semver 0.9.0", -] - [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver 1.0.14", + "semver", ] [[package]] @@ -3365,10 +3218,24 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.20.6" +name = "rustix" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aab8ee6c7097ed6057f43c187a62418d0c05a4bd5f18b3571db50ee0f9ce033" +checksum = "a3807b5d10909833d3e9acd1eb5fb988f79376ff10fce42937de71a449c4c588" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys 0.42.0", +] + +[[package]] +name = "rustls" +version = "0.20.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "539a2bfe908f471bfa933876bd1eb6a19cf2176d375f82ef7f99530a40e48c2c" dependencies = [ "log", "ring", @@ -3423,20 +3290,20 @@ name = "safekeeper" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "async-trait", "byteorder", "bytes", - "clap 4.0.15", + "clap 4.0.29", "const_format", "crc32c", - "etcd_broker", "fs2", "git-version", "hex", "humantime", "hyper", "metrics", - "nix 0.25.0", + "nix 0.25.1", "once_cell", "parking_lot 0.12.1", "postgres", @@ -3450,6 +3317,7 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "storage_broker", "tempfile", "thiserror", "tokio", @@ -3488,7 +3356,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88d6731146462ea25d9244b2ed5fd1d716d25c52e4d54aa4fb0f3c4e9854dbe2" dependencies = [ "lazy_static", - "windows-sys", + "windows-sys 0.36.1", ] [[package]] @@ -3536,32 +3404,17 @@ dependencies = [ "libc", ] -[[package]] -name = "semver" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" -dependencies = [ - "semver-parser", -] - [[package]] name = "semver" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" -[[package]] -name = "semver-parser" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" - [[package]] name = "sentry" -version = "0.29.0" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6425e2a14006415449fb0a3e9a119df5032f59e7a2d9350cf8738eca290dfc5" +checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc" dependencies = [ "httpdate", "native-tls", @@ -3576,9 +3429,9 @@ dependencies = [ [[package]] name = "sentry-backtrace" -version = "0.29.0" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"04d79c194e5c20fe602e81faf39f3cff0f275ec61283f437a892cfd6544da592" +checksum = "afe4800806552aab314129761d5d3b3d422284eca3de2ab59e9fd133636cbd3d" dependencies = [ "backtrace", "once_cell", @@ -3588,23 +3441,23 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.29.0" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1c2a57601eeb870521cc241caee27e57a012f297ece3c1b7eee87f2a531edb5" +checksum = "a42938426670f6e7974989cd1417837a96dd8bbb01567094f567d6acb360bf88" dependencies = [ "hostname", "libc", "os_info", - "rustc_version 0.4.0", + "rustc_version", "sentry-core", "uname", ] [[package]] name = "sentry-core" -version = "0.29.0" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8be90ea119c6d0664c8ab534013bc9e90355e7004d782d5d1492ca513393b929" +checksum = "4df9b9d8de2658a1ecd4e45f7b06c80c5dd97b891bfbc7c501186189b7e9bbdf" dependencies = [ "once_cell", "rand", @@ -3615,9 +3468,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.29.0" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec217c3290e3f0d128154da731c28efa8f62cf8e3c3a006fd4bc3407c959176" +checksum = "0af37b8500f273e511ebd6eb0d342ff7937d64ce3f134764b2b4653112d48cb4" dependencies = [ "sentry-backtrace", "sentry-core", @@ -3625,9 +3478,9 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.29.0" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ad85f0addf16310a1fbcf3facc7acb17ef5dbf6ae059d2f3c38442a471404d" +checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6" dependencies = [ "debugid 0.8.0", "getrandom", @@ -3637,23 +3490,23 @@ dependencies = [ "thiserror", "time", "url", - "uuid 1.2.1", + "uuid 1.2.2", ] [[package]] name = "serde" -version = "1.0.145" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" +checksum = "256b9932320c590e707b94576e3cc1f7c9024d0ee6612dfbcf1cb106cbe8e055" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.145" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" +checksum = "b4eae9b04cbffdfd550eb462ed33bc6a1b68c935127d008b27444d08380f94e4" dependencies = [ "proc-macro2", "quote", @@ -3662,9 +3515,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.86" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074" +checksum = "020ff22c755c2ed3f8cf162dbb41a7268d934702f3ed3631656ea597e08fc3db" dependencies = [ "itoa", "ryu", @@ -3685,9 +3538,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368f2d60d049ea019a84dcd6687b0d1e0030fe663ae105039bdf967ed5e6a9a7" +checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" dependencies = [ "base64", "chrono", @@ -3701,9 +3554,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ccadfacf6cf10faad22bbadf55986bdd0856edfb5d9210aa1dcf1f516e84e93" +checksum = 
"e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa" dependencies = [ "darling", "proc-macro2", @@ -3846,9 +3699,11 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" name = "storage_broker" version = "0.1.0" dependencies = [ + "anyhow", "async-stream", "bytes", - "clap 4.0.15", + "clap 4.0.29", + "const_format", "futures", "futures-core", "futures-util", @@ -3858,11 +3713,11 @@ dependencies = [ "metrics", "once_cell", "parking_lot 0.12.1", - "prost 0.11.2", + "prost", "tokio", "tokio-stream", - "tonic 0.8.2", - "tonic-build 0.8.2", + "tonic", + "tonic-build", "tracing", "utils", "workspace_hack", @@ -3946,9 +3801,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.102" +version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" +checksum = "60b9b43d45702de4c839cb9b51d9f529c5dd26a4aff255b42b1ebc03e88ee908" dependencies = [ "proc-macro2", "quote", @@ -4016,9 +3871,9 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.15.1" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" @@ -4051,22 +3906,30 @@ dependencies = [ [[package]] name = "time" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d634a985c4d4238ec39cacaed2e7ae552fbd3c476b552c1deac3021b7d7eaf0c" +checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" dependencies = [ "itoa", - "libc", - "num_threads", "serde", + "time-core", "time-macros", ] [[package]] -name = "time-macros" -version = "0.2.4" +name = "time-core" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" + +[[package]] +name = "time-macros" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" +dependencies = [ + "time-core", +] [[package]] name = "tinytemplate" @@ -4125,9 +3988,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "1.8.0" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" +checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" dependencies = [ "proc-macro2", "quote", @@ -4239,9 +4102,9 @@ dependencies = [ [[package]] name = "tonic" -version = "0.7.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5be9d60db39854b30b835107500cf0aca0b0d14d6e1c3de124217c23a29c2ddb" +checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" dependencies = [ "async-stream", "async-trait", @@ -4257,41 +4120,12 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost 0.10.4", - "prost-derive 0.10.1", - "tokio", - "tokio-stream", - "tokio-util", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - -[[package]] -name = "tonic" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"55b9af819e54b8f33d453655bef9b9acc171568fb49523078d0cc4e7484200ec" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost 0.11.2", - "prost-derive 0.11.2", + "prost", + "prost-derive", + "rustls-native-certs", + "rustls-pemfile", "tokio", + "tokio-rustls", "tokio-stream", "tokio-util", "tower", @@ -4303,26 +4137,13 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.7.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1" +checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" dependencies = [ "prettyplease", "proc-macro2", - "prost-build 0.10.4", - "quote", - "syn", -] - -[[package]] -name = "tonic-build" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c6fd7c2581e36d63388a9e04c350c21beb7a8b059580b2e93993c526899ddc" -dependencies = [ - "prettyplease", - "proc-macro2", - "prost-build 0.11.2", + "prost-build", "quote", "syn", ] @@ -4349,9 +4170,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" +checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858" dependencies = [ "bitflags", "bytes", @@ -4472,9 +4293,9 @@ checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" [[package]] name = "typenum" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "uname" @@ -4572,7 +4393,7 @@ dependencies = [ "hyper", "jsonwebtoken", "metrics", - "nix 0.25.0", + "nix 0.25.1", "once_cell", "pq_proto", "rand", @@ -4604,9 +4425,9 @@ checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" [[package]] name = "uuid" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" +checksum = "422ee0de9031b5b948b97a8fc04e3aa35230001a722ddd27943e0be31564ce4c" dependencies = [ "getrandom", "serde", @@ -4618,12 +4439,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcell" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" - [[package]] name = "vcpkg" version = "0.2.15" @@ -4636,27 +4451,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" -[[package]] -name = "void" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" - -[[package]] -name = "volatile-register" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6" -dependencies = [ - "vcell", -] - [[package]] name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.15", + "clap 4.0.29", "env_logger", "log", "once_cell", @@ -4836,43 +4636,100 @@ version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" dependencies = [ - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_msvc", + "windows_aarch64_msvc 0.36.1", + "windows_i686_gnu 0.36.1", + "windows_i686_msvc 0.36.1", + "windows_x86_64_gnu 0.36.1", + "windows_x86_64_msvc 0.36.1", ] +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc 0.42.0", + "windows_i686_gnu 0.42.0", + "windows_i686_msvc 0.42.0", + "windows_x86_64_gnu 0.42.0", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc 0.42.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" + [[package]] name = "windows_aarch64_msvc" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" + [[package]] name = "windows_i686_gnu" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" +[[package]] +name = "windows_i686_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" + [[package]] name = "windows_i686_msvc" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" +[[package]] +name = "windows_i686_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" + [[package]] name = "windows_x86_64_gnu" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" + [[package]] name = "windows_x86_64_msvc" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" + [[package]] name = "winreg" version = "0.10.1" @@ -4889,7 +4746,7 @@ dependencies = [ "ahash", "anyhow", "bytes", - "clap 4.0.15", + "clap 4.0.29", "crossbeam-utils", "either", "fail", @@ -4905,14 +4762,14 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", - "prost 0.10.4", - "prost 0.11.2", + "prost", "rand", "regex", "regex-syntax", "reqwest", "scopeguard", "serde", + "socket2", "stable_deref_trait", "syn", "tokio", diff --git a/Dockerfile b/Dockerfile index f0244fa8d3..0d5ba73456 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,7 +79,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ - -c "broker_endpoints=['http://etcd:2379']" \ + -c "broker_endpoint='http://storage_broker:50051'" \ -c "pg_distrib_dir='/usr/local/'" \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" diff --git a/README.md b/README.md index c31bac6446..30bde949a9 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,12 @@ See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more inf * On Ubuntu or Debian, this set of packages should be sufficient to build the code: ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client protobuf-compiler +libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ - libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib protobuf-compiler + libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -44,7 +44,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 1. Install XCode and dependencies ``` xcode-select --install -brew install protobuf etcd openssl flex bison +brew install protobuf openssl flex bison ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -123,12 +123,12 @@ Stopped pageserver 1 process with pid 2545906 # start pageserver and safekeeper > ./target/debug/neon_local start -Starting etcd broker using "/usr/bin/etcd" -etcd started, pid: 2545996 +Starting neon broker at 127.0.0.1:50051 +storage_broker started, pid: 2918372 Starting pageserver at '127.0.0.1:64000' in '.neon'. -pageserver started, pid: 2546005 +pageserver started, pid: 2918386 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. -safekeeper 1 started, pid: 2546041 +safekeeper 1 started, pid: 2918437 # start postgres compute node > ./target/debug/neon_local pg start main diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 2ab48fa76c..00b34aafb1 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -25,5 +25,7 @@ url = "2.2.2" pageserver_api = { path = "../libs/pageserver_api" } postgres_connection = { path = "../libs/postgres_connection" } safekeeper_api = { path = "../libs/safekeeper_api" } +# Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. 
+storage_broker = { version = "0.1", path = "../storage_broker" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/simple.conf b/control_plane/simple.conf index ae60657400..6014e8dffd 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -10,5 +10,5 @@ id = 1 pg_port = 5454 http_port = 7676 -[etcd_broker] -broker_endpoints = ['http://127.0.0.1:2379'] +[broker] +listen_addr = '127.0.0.1:50051' diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 99ddae862d..6f059d535e 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,10 +8,10 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use control_plane::compute::ComputeControlPlane; -use control_plane::local_env::{EtcdBroker, LocalEnv}; +use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; -use control_plane::{etcd, local_env}; +use control_plane::{broker, local_env}; use pageserver_api::models::TimelineInfo; use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, @@ -22,9 +22,10 @@ use safekeeper_api::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use std::collections::{BTreeSet, HashMap}; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::process::exit; use std::str::FromStr; +use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use utils::{ auth::{Claims, Scope}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -41,13 +42,12 @@ project_git_version!(GIT_VERSION); const DEFAULT_PG_VERSION: &str = "14"; -fn default_conf(etcd_binary_path: &Path) -> String { +fn default_conf() -> String { format!( r#" # Default built-in configuration, defined in main.rs -[etcd_broker] -broker_endpoints = ['http://localhost:2379'] -etcd_binary_path = '{etcd_binary_path}' +[broker] +listen_addr = '{DEFAULT_BROKER_ADDR}' [pageserver] id = {DEFAULT_PAGESERVER_ID} @@ -60,7 +60,6 @@ id = {DEFAULT_SAFEKEEPER_ID} pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} "#, - etcd_binary_path = etcd_binary_path.display(), pageserver_auth_type = AuthType::Trust, ) } @@ -298,7 +297,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { })? } else { // Built-in default config - default_conf(&EtcdBroker::locate_etcd()?) 
+ default_conf() }; let pg_version = init_match @@ -807,14 +806,14 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { - etcd::start_etcd_process(env)?; + broker::start_broker_process(env)?; let pageserver = PageServerNode::from_env(env); // Postgres nodes are not started automatically if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { eprintln!("pageserver start failed: {e}"); - try_stop_etcd_process(env); + try_stop_storage_broker_process(env); exit(1); } @@ -822,7 +821,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id); - try_stop_etcd_process(env); + try_stop_storage_broker_process(env); exit(1); } } @@ -854,14 +853,14 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< } } - try_stop_etcd_process(env); + try_stop_storage_broker_process(env); Ok(()) } -fn try_stop_etcd_process(env: &local_env::LocalEnv) { - if let Err(e) = etcd::stop_etcd_process(env) { - eprintln!("etcd stop failed: {e}"); +fn try_stop_storage_broker_process(env: &local_env::LocalEnv) { + if let Err(e) = broker::stop_broker_process(env) { + eprintln!("neon broker stop failed: {e}"); } } diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs new file mode 100644 index 0000000000..bd60580012 --- /dev/null +++ b/control_plane/src/broker.rs @@ -0,0 +1,48 @@ +use anyhow::Context; + +use std::path::PathBuf; + +use crate::{background_process, local_env}; + +pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + let broker = &env.broker; + let listen_addr = &broker.listen_addr; + + print!("Starting neon broker at {}", listen_addr); + + let args = [format!("--listen-addr={listen_addr}")]; + + let client = reqwest::blocking::Client::new(); + background_process::start_process( + "storage_broker", + &env.base_data_dir, + &env.storage_broker_bin(), + &args, + [], + background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)), + || { + let url = broker.client_url(); + let status_url = url.join("status").with_context(|| { + format!("Failed to append /status path to broker endpoint {url}",) + })?; + let request = client + .get(status_url) + .build() + .with_context(|| format!("Failed to construct request to broker endpoint {url}"))?; + match client.execute(request) { + Ok(resp) => Ok(resp.status().is_success()), + Err(_) => Ok(false), + } + }, + ) + .context("Failed to spawn storage_broker subprocess")?; + Ok(()) +} + +pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env)) +} + +fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf { + env.base_data_dir.join("storage_broker.pid") +} diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs deleted file mode 100644 index 031ffa539b..0000000000 --- a/control_plane/src/etcd.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::{fs, path::PathBuf}; - -use anyhow::Context; - -use crate::{background_process, local_env}; - -pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { - let etcd_broker = &env.etcd_broker; - print!( - "Starting etcd broker using {:?}", - etcd_broker.etcd_binary_path - ); - - let 
etcd_data_dir = env.base_data_dir.join("etcd"); - fs::create_dir_all(&etcd_data_dir) - .with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?; - - let client_urls = etcd_broker.comma_separated_endpoints(); - let args = [ - format!("--data-dir={}", etcd_data_dir.display()), - format!("--listen-client-urls={client_urls}"), - format!("--advertise-client-urls={client_urls}"), - // Set --quota-backend-bytes to keep the etcd virtual memory - // size smaller. Our test etcd clusters are very small. - // See https://github.com/etcd-io/etcd/issues/7910 - "--quota-backend-bytes=100000000".to_string(), - // etcd doesn't compact (vacuum) with default settings, - // enable it to prevent space exhaustion. - "--auto-compaction-mode=revision".to_string(), - "--auto-compaction-retention=1".to_string(), - ]; - - let pid_file_path = etcd_pid_file_path(env); - - let client = reqwest::blocking::Client::new(); - - background_process::start_process( - "etcd", - &etcd_data_dir, - &etcd_broker.etcd_binary_path, - &args, - [], - background_process::InitialPidFile::Create(&pid_file_path), - || { - for broker_endpoint in &etcd_broker.broker_endpoints { - let request = broker_endpoint - .join("health") - .with_context(|| { - format!( - "Failed to append /health path to broker endopint {}", - broker_endpoint - ) - }) - .and_then(|url| { - client.get(&url.to_string()).build().with_context(|| { - format!("Failed to construct request to etcd endpoint {url}") - }) - })?; - if client.execute(request).is_ok() { - return Ok(true); - } - } - - Ok(false) - }, - ) - .context("Failed to spawn etcd subprocess")?; - - Ok(()) -} - -pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { - background_process::stop_process(true, "etcd", &etcd_pid_file_path(env)) -} - -fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf { - env.base_data_dir.join("etcd.pid") -} diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 7c1007b133..6829479ad5 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -8,8 +8,8 @@ // mod background_process; +pub mod broker; pub mod compute; -pub mod etcd; pub mod local_env; pub mod pageserver; pub mod postgresql_conf; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ac4ebd0d1e..ed9e467eee 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -4,12 +4,16 @@ //! script which will use local paths. use anyhow::{bail, ensure, Context}; + use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; use std::env; use std::fs; +use std::net::IpAddr; +use std::net::Ipv4Addr; +use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use utils::{ @@ -62,7 +66,7 @@ pub struct LocalEnv { #[serde(default)] pub private_key_path: PathBuf, - pub etcd_broker: EtcdBroker, + pub broker: NeonBroker, pub pageserver: PageServerConf, @@ -78,67 +82,26 @@ pub struct LocalEnv { branch_name_mappings: HashMap>, } -/// Etcd broker config for cluster internal communication. -#[serde_as] +/// Broker config for cluster internal communication. #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] -pub struct EtcdBroker { - /// A prefix to all to any key when pushing/polling etcd from a node. - #[serde(default)] - pub broker_etcd_prefix: Option, - - /// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. 
- #[serde(default)] - #[serde_as(as = "Vec")] - pub broker_endpoints: Vec, - - /// Etcd binary path to use. - #[serde(default)] - pub etcd_binary_path: PathBuf, +#[serde(default)] +pub struct NeonBroker { + /// Broker listen address for storage nodes coordination, e.g. '127.0.0.1:50051'. + pub listen_addr: SocketAddr, } -impl EtcdBroker { - pub fn locate_etcd() -> anyhow::Result { - let which_output = Command::new("which") - .arg("etcd") - .output() - .context("Failed to run 'which etcd' command")?; - let stdout = String::from_utf8_lossy(&which_output.stdout); - ensure!( - which_output.status.success(), - "'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}", - which_output.status, - String::from_utf8_lossy(&which_output.stderr) - ); - - let etcd_path = PathBuf::from(stdout.trim()); - ensure!( - etcd_path.is_file(), - "'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}", - etcd_path.display() - ); - - Ok(etcd_path) +// Dummy Default impl to satisfy Deserialize derive. +impl Default for NeonBroker { + fn default() -> Self { + NeonBroker { + listen_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0), + } } +} - pub fn comma_separated_endpoints(&self) -> String { - self.broker_endpoints - .iter() - .map(|url| { - // URL by default adds a '/' path at the end, which is not what etcd CLI wants. - let url_string = url.as_str(); - if url_string.ends_with('/') { - &url_string[0..url_string.len() - 1] - } else { - url_string - } - }) - .fold(String::new(), |mut comma_separated_urls, url| { - if !comma_separated_urls.is_empty() { - comma_separated_urls.push(','); - } - comma_separated_urls.push_str(url); - comma_separated_urls - }) +impl NeonBroker { + pub fn client_url(&self) -> Url { + Url::parse(&format!("http://{}", self.listen_addr)).expect("failed to construct url") } } @@ -234,6 +197,10 @@ impl LocalEnv { self.neon_distrib_dir.join("safekeeper") } + pub fn storage_broker_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("storage_broker") + } + pub fn pg_data_dirs_path(&self) -> PathBuf { self.base_data_dir.join("pgdatadirs").join("tenants") } @@ -511,8 +478,8 @@ mod tests { "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" ); - let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']"; - let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']"; + let string_to_replace = "listen_addr = '127.0.0.1:50051'"; + let spoiled_url_str = "listen_addr = '!@$XOXO%^&'"; let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); assert!( spoiled_url_toml.contains(spoiled_url_str), diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index d845c9d7e9..51e540e39c 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -96,13 +96,8 @@ impl PageServerNode { } } - pub fn initialize( - &self, - create_tenant: Option, - initial_timeline_id: Option, - config_overrides: &[&str], - pg_version: u32, - ) -> anyhow::Result { + // pageserver conf overrides defined by neon_local configuration. + fn neon_local_overrides(&self) -> Vec { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
let pg_distrib_dir_param = format!( @@ -117,41 +112,32 @@ impl PageServerNode { ); let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); - let broker_endpoints_param = format!( - "broker_endpoints=[{}]", - self.env - .etcd_broker - .broker_endpoints - .iter() - .map(|url| format!("'{url}'")) - .collect::>() - .join(",") - ); - let broker_etcd_prefix_param = self - .env - .etcd_broker - .broker_etcd_prefix - .as_ref() - .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); + let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); - let mut init_config_overrides = config_overrides.to_vec(); - init_config_overrides.push(&id); - init_config_overrides.push(&pg_distrib_dir_param); - init_config_overrides.push(&authg_type_param); - init_config_overrides.push(&listen_http_addr_param); - init_config_overrides.push(&listen_pg_addr_param); - init_config_overrides.push(&broker_endpoints_param); - - if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { - init_config_overrides.push(broker_etcd_prefix_param); - } + let mut overrides = vec![ + id, + pg_distrib_dir_param, + authg_type_param, + listen_http_addr_param, + listen_pg_addr_param, + broker_endpoint_param, + ]; if self.env.pageserver.auth_type != AuthType::Trust { - init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); + overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned()); } + overrides + } + pub fn initialize( + &self, + create_tenant: Option, + initial_timeline_id: Option, + config_overrides: &[&str], + pg_version: u32, + ) -> anyhow::Result { let mut pageserver_process = self - .start_node(&init_config_overrides, &self.env.base_data_dir, true) + .start_node(config_overrides, &self.env.base_data_dir, true) .with_context(|| { format!( "Failed to start a process for pageserver {}", @@ -224,6 +210,9 @@ impl PageServerNode { datadir: &Path, update_config: bool, ) -> anyhow::Result { + let mut overrides = self.neon_local_overrides(); + overrides.extend(config_overrides.iter().map(|&c| c.to_owned())); + print!( "Starting pageserver at '{}' in '{}'", self.pg_connection_config.raw_address(), @@ -242,7 +231,7 @@ impl PageServerNode { args.push("--update-config"); } - for config_override in config_overrides { + for config_override in &overrides { args.extend(["-c", config_override]); } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 583d9709d0..4c0812a5e3 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -131,13 +131,8 @@ impl SafekeeperNode { args.push("--no-sync"); } - let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints(); - if !comma_separated_endpoints.is_empty() { - args.extend(["--broker-endpoints", &comma_separated_endpoints]); - } - if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { - args.extend(["--broker-etcd-prefix", prefix]); - } + let broker_endpoint = format!("{}", self.env.broker.client_url()); + args.extend(["--broker-endpoint", &broker_endpoint]); let mut backup_threads = String::new(); if let Some(threads) = self.conf.backup_threads { diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 61b53dba41..b24cb80ce4 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -1,29 +1,6 @@ version: '3' services: - etcd: - restart: always - image: quay.io/coreos/etcd:v3.5.4 - ports: - - 2379:2379 
- - 2380:2380 - environment: - # This signifficantly speeds up etcd and we anyway don't data persistency there. - ETCD_UNSAFE_NO_FSYNC: "1" - command: - - "etcd" - - "--auto-compaction-mode=revision" - - "--auto-compaction-retention=1" - - "--name=etcd-cluster" - - "--initial-cluster-state=new" - - "--initial-cluster-token=etcd-cluster-1" - - "--initial-cluster=etcd-cluster=http://etcd:2380" - - "--initial-advertise-peer-urls=http://etcd:2380" - - "--advertise-client-urls=http://etcd:2379" - - "--listen-client-urls=http://0.0.0.0:2379" - - "--listen-peer-urls=http://0.0.0.0:2380" - - "--quota-backend-bytes=134217728" # 128 MB - minio: restart: always image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z @@ -56,7 +33,7 @@ services: restart: always image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - - BROKER_ENDPOINT='http://etcd:2379' + - BROKER_ENDPOINT='http://storage_broker:50051' - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 @@ -68,7 +45,7 @@ services: - "-c" command: - "/usr/local/bin/pageserver -D /data/.neon/ - -c \"broker_endpoints=[$$BROKER_ENDPOINT]\" + -c \"broker_endpoint=$$BROKER_ENDPOINT\" -c \"listen_pg_addr='0.0.0.0:6400'\" -c \"listen_http_addr='0.0.0.0:9898'\" -c \"remote_storage={endpoint='http://minio:9000', @@ -76,7 +53,7 @@ services: bucket_region='eu-north-1', prefix_in_bucket='/pageserver/'}\"" depends_on: - - etcd + - storage_broker - minio_create_buckets safekeeper1: @@ -85,7 +62,7 @@ services: environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 - SAFEKEEPER_ID=1 - - BROKER_ENDPOINT=http://etcd:2379 + - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 @@ -99,14 +76,14 @@ services: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID - --broker-endpoints=$$BROKER_ENDPOINT + --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - - etcd + - storage_broker - minio_create_buckets safekeeper2: @@ -115,7 +92,7 @@ services: environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 - SAFEKEEPER_ID=2 - - BROKER_ENDPOINT=http://etcd:2379 + - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 @@ -129,14 +106,14 @@ services: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID - --broker-endpoints=$$BROKER_ENDPOINT + --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - - etcd + - storage_broker - minio_create_buckets safekeeper3: @@ -145,7 +122,7 @@ services: environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 - SAFEKEEPER_ID=3 - - BROKER_ENDPOINT=http://etcd:2379 + - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 @@ -159,16 +136,25 @@ services: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID - --broker-endpoints=$$BROKER_ENDPOINT + --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - - etcd + - storage_broker - minio_create_buckets + 
storage_broker: + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + ports: + - 50051:50051 + command: + - "storage_broker" + - "--listen-addr=0.0.0.0:50051" + compute: restart: always build: diff --git a/docs/authentication.md b/docs/authentication.md index c5c4f02833..0752fae19f 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -2,7 +2,7 @@ ### Overview We use JWT tokens in communication between almost all components (compute, pageserver, safekeeper, CLI) regardless of the protocol used (HTTP/PostgreSQL). -Etcd currently has no authentication. +storage_broker currently has no authentication. Authentication is optional and is disabled by default for easier debugging. It is used in some tests, though. Note that we do not cover authentication with `pg.neon.tech` here. @@ -84,7 +84,7 @@ the scope is the tenant and the token is usually passed through the Pageserver keeps track of multiple tenants, each having multiple timelines. For each timeline, it connects to the corresponding Safekeeper. Information about "corresponding Safekeeper" is published by Safekeepers -in the Etcd, but they do not publish access tokens, otherwise what is +in the storage_broker, but they do not publish access tokens, otherwise what is the point of authentication. Pageserver keeps a connection to some set of Safekeepers, which diff --git a/docs/docker.md b/docs/docker.md index 42f0048e6f..d264a1a748 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -23,9 +23,9 @@ We build all images after a successful `release` tests run and push automaticall You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. -- etcd x 1 - pageserver x 1 - safekeeper x 3 +- storage_broker x 1 - compute x 1 - MinIO x 1 # This is Amazon S3 compatible object storage @@ -41,7 +41,7 @@ $ cd docker-compose/docker-compose.yml $ docker-compose down # remove the conainers if exists $ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver -Creating dockercompose_etcd3_1 ... +Creating docker-compose_storage_broker_1 ... done (...omit...) ``` diff --git a/docs/settings.md b/docs/settings.md index 878681fce1..58d32157a3 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -10,7 +10,6 @@ the values in the config file, if any are specified for the same key and get int ```toml # Initial configuration file created by 'pageserver --init' - listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' @@ -25,13 +24,12 @@ max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant initial_superuser_name = 'cloud_admin' -broker_etcd_prefix = 'neon' -broker_endpoints = ['some://etcd'] +broker_endpoint = 'http://127.0.0.1:50051' # [remote_storage] ``` -The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, +The config above shows default values for all basic pageserver settings, besides `broker_endpoint`: that one has to be set by the user, see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. 
@@ -50,16 +48,10 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage= Note that TOML distinguishes between strings and integers, the former require single or double quotes around them. -#### broker_endpoints +#### broker_endpoint -A list of endpoints (etcd currently) to connect and pull the information from. -Mandatory, does not have a default, since requires etcd to be started as a separate process, -and its connection url should be specified separately. - -#### broker_etcd_prefix - -A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster. -Default is `neon`. +A storage broker endpoint to connect and pull the information from. Default is +`'http://127.0.0.1:50051'`. #### checkpoint_distance diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml deleted file mode 100644 index b18dcbe5a3..0000000000 --- a/libs/etcd_broker/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] - name = "etcd_broker" - version = "0.1.0" - edition = "2021" - - [dependencies] - etcd-client = "0.9.0" - regex = "1.4.5" - serde = { version = "1.0", features = ["derive"] } - serde_json = "1" - serde_with = "2.0" - once_cell = "1.13.0" - - utils = { path = "../utils" } - workspace_hack = { version = "0.1", path = "../../workspace_hack" } - tokio = "1" - tracing = "0.1" - thiserror = "1" diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs deleted file mode 100644 index 8f698977a9..0000000000 --- a/libs/etcd_broker/src/lib.rs +++ /dev/null @@ -1,209 +0,0 @@ -//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent). -//! Intended to connect services to each other, not to store their data. - -/// All broker keys, that are used when dealing with etcd. -pub mod subscription_key; -/// All broker values, possible to use when dealing with etcd. -pub mod subscription_value; - -use std::str::FromStr; - -use serde::de::DeserializeOwned; - -use subscription_key::SubscriptionKey; -use tokio::{sync::mpsc, task::JoinHandle}; -use tracing::*; - -use crate::subscription_key::SubscriptionFullKey; - -pub use etcd_client::*; - -/// Default value to use for prefixing to all etcd keys with. -/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster. -pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; - -/// A way to control the data retrieval from a certain subscription. -pub struct BrokerSubscription { - /// An unbounded channel to fetch the relevant etcd updates from. - pub value_updates: mpsc::UnboundedReceiver>, - key: SubscriptionKey, - /// A subscription task handle, to allow waiting on it for the task to complete. - /// Both the updates channel and the handle require `&mut`, so it's better to keep - /// both `pub` to allow using both in the same structures without borrow checker complaining. - pub watcher_handle: JoinHandle>, - watcher: Watcher, -} - -impl BrokerSubscription { - /// Cancels the subscription, stopping the data poller and waiting for it to shut down. 
- pub async fn cancel(mut self) -> Result<(), BrokerError> { - self.watcher.cancel().await.map_err(|e| { - BrokerError::EtcdClient( - e, - format!("Failed to cancel broker subscription, kind: {:?}", self.key), - ) - })?; - match (&mut self.watcher_handle).await { - Ok(res) => res, - Err(e) => { - if e.is_cancelled() { - // don't error on the tasks that are cancelled already - Ok(()) - } else { - Err(BrokerError::InternalError(format!( - "Panicked during broker subscription task, kind: {:?}, error: {e}", - self.key - ))) - } - } - } - } -} - -impl Drop for BrokerSubscription { - fn drop(&mut self) { - // we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped, - // no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task. - self.watcher_handle.abort(); - } -} - -/// An update from the etcd broker. -pub struct BrokerUpdate { - /// Etcd generation version, the bigger the more actual the data is. - pub etcd_version: i64, - /// Etcd key for the corresponding value, parsed from the broker KV. - pub key: SubscriptionFullKey, - /// Current etcd value, parsed from the broker KV. - pub value: V, -} - -#[derive(Debug, thiserror::Error)] -pub enum BrokerError { - #[error("Etcd client error: {0}. Context: {1}")] - EtcdClient(etcd_client::Error, String), - #[error("Error during parsing etcd key: {0}")] - KeyNotParsed(String), - #[error("Internal error: {0}")] - InternalError(String), -} - -/// Creates a background task to poll etcd for timeline updates from safekeepers. -/// Stops and returns `Err` on any error during etcd communication. -/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle, -/// exiting normally in such cases. -/// Etcd values are parsed as json fukes into a type, specified in the generic patameter. -pub async fn subscribe_for_json_values( - client: &mut Client, - key: SubscriptionKey, -) -> Result, BrokerError> -where - V: DeserializeOwned + Send + 'static, -{ - subscribe_for_values(client, key, |_, value_str| { - match serde_json::from_str::(value_str) { - Ok(value) => Some(value), - Err(e) => { - error!("Failed to parse value str '{value_str}': {e}"); - None - } - } - }) - .await -} - -/// Same as [`subscribe_for_json_values`], but allows to specify a custom parser of a etcd value string. -pub async fn subscribe_for_values( - client: &mut Client, - key: SubscriptionKey, - value_parser: P, -) -> Result, BrokerError> -where - V: Send + 'static, - P: Fn(SubscriptionFullKey, &str) -> Option + Send + 'static, -{ - info!("Subscribing to broker value updates, key: {key:?}"); - let subscription_key = key.clone(); - - let (watcher, mut stream) = client - .watch(key.watch_key(), Some(WatchOptions::new().with_prefix())) - .await - .map_err(|e| { - BrokerError::EtcdClient( - e, - format!("Failed to init the watch for subscription {key:?}"), - ) - })?; - - let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel(); - let watcher_handle = tokio::spawn(async move { - while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!( - "Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind - )))? 
{ - if resp.canceled() { - info!("Watch for timeline updates subscription was canceled, exiting"); - break; - } - - let events = resp.events(); - debug!("Processing {} events", events.len()); - - for event in events { - if EventType::Put == event.event_type() { - if let Some(new_etcd_kv) = event.kv() { - match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) { - Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate { - etcd_version: new_etcd_kv.version(), - key, - value, - }) { - info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}"); - break; - }, - Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"), - Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"), - Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"), - }; - } - } - } - } - - Ok(()) - }.instrument(info_span!("etcd_broker"))); - - Ok(BrokerSubscription { - key: subscription_key, - value_updates: value_updates_receiver, - watcher_handle, - watcher, - }) -} - -fn parse_etcd_kv( - kv: &KeyValue, - value_parser: &P, - cluster_prefix: &str, -) -> Result, BrokerError> -where - P: Fn(SubscriptionFullKey, &str) -> Option, -{ - let key_str = kv.key_str().map_err(|e| { - BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string()) - })?; - let value_str = kv.value_str().map_err(|e| { - BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string()) - })?; - - if !key_str.starts_with(cluster_prefix) { - return Err(BrokerError::KeyNotParsed(format!( - "KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}" - ))); - } - - let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| { - BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}")) - })?; - - Ok(value_parser(key, value_str).map(|value| (key, value))) -} diff --git a/libs/etcd_broker/src/subscription_key.rs b/libs/etcd_broker/src/subscription_key.rs deleted file mode 100644 index a11d2ab106..0000000000 --- a/libs/etcd_broker/src/subscription_key.rs +++ /dev/null @@ -1,310 +0,0 @@ -//! Etcd broker keys, used in the project and shared between instances. -//! The keys are split into two categories: -//! -//! * [`SubscriptionFullKey`] full key format: `/////` -//! Always returned from etcd in this form, always start with the user key provided. -//! -//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available. -//! Full key always starts with the user input one, due to etcd subscription properties. - -use std::{fmt::Display, str::FromStr}; - -use once_cell::sync::Lazy; -use regex::{Captures, Regex}; -use utils::id::{NodeId, TenantId, TenantTimelineId}; - -/// The subscription kind to the timeline updates from safekeeper. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct SubscriptionKey { - /// Generic cluster prefix, allowing to use the same etcd instance by multiple logic groups. - pub cluster_prefix: String, - /// The subscription kind. - pub kind: SubscriptionKind, -} - -/// All currently possible key kinds of a etcd broker subscription. -/// Etcd works so, that every key that starts with the subbscription key given is considered matching and -/// returned as part of the subscrption. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum SubscriptionKind { - /// Get every update in etcd. 
- All, - /// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind. - TenantTimelines(TenantId), - /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind. - Timeline(TenantTimelineId), - /// Get etcd timeline updates, specific to a certain node kind. - Node(TenantTimelineId, NodeKind), - /// Get etcd timeline updates for a certain operation on specific nodes. - Operation(TenantTimelineId, NodeKind, OperationKind), -} - -/// All kinds of nodes, able to write into etcd. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum NodeKind { - Safekeeper, - Pageserver, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum OperationKind { - Safekeeper(SkOperationKind), -} - -/// Current operations, running inside the safekeeper node. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum SkOperationKind { - TimelineInfo, - WalBackup, -} - -static SUBSCRIPTION_FULL_KEY_REGEX: Lazy = Lazy::new(|| { - Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$") - .expect("wrong subscription full etcd key regex") -}); - -/// Full key, received from etcd during any of the component's work. -/// No other etcd keys are considered during system's work. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct SubscriptionFullKey { - pub id: TenantTimelineId, - pub node_kind: NodeKind, - pub operation: OperationKind, - pub node_id: NodeId, -} - -impl SubscriptionKey { - /// Subscribes for all etcd updates. - pub fn all(cluster_prefix: String) -> Self { - SubscriptionKey { - cluster_prefix, - kind: SubscriptionKind::All, - } - } - - /// Subscribes to a given timeline info updates from safekeepers. - pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self { - Self { - cluster_prefix, - kind: SubscriptionKind::Operation( - timeline, - NodeKind::Safekeeper, - OperationKind::Safekeeper(SkOperationKind::TimelineInfo), - ), - } - } - - /// Subscribes to all timeine updates during specific operations, running on the corresponding nodes. - pub fn operation( - cluster_prefix: String, - timeline: TenantTimelineId, - node_kind: NodeKind, - operation: OperationKind, - ) -> Self { - Self { - cluster_prefix, - kind: SubscriptionKind::Operation(timeline, node_kind, operation), - } - } - - /// Etcd key to use for watching a certain timeline updates from safekeepers. 
- pub fn watch_key(&self) -> String { - let cluster_prefix = &self.cluster_prefix; - match self.kind { - SubscriptionKind::All => cluster_prefix.to_string(), - SubscriptionKind::TenantTimelines(tenant_id) => { - format!("{cluster_prefix}/{tenant_id}") - } - SubscriptionKind::Timeline(id) => { - format!("{cluster_prefix}/{id}") - } - SubscriptionKind::Node(id, node_kind) => { - format!("{cluster_prefix}/{id}/{node_kind}") - } - SubscriptionKind::Operation(id, node_kind, operation_kind) => { - format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}") - } - } - } -} - -impl Display for OperationKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - OperationKind::Safekeeper(o) => o.fmt(f), - } - } -} - -impl FromStr for OperationKind { - type Err = String; - - fn from_str(operation_kind_str: &str) -> Result { - match operation_kind_str { - "timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)), - "wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)), - _ => Err(format!("Unknown operation kind: {operation_kind_str}")), - } - } -} - -impl Display for SubscriptionFullKey { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let Self { - id, - node_kind, - operation, - node_id, - } = self; - write!(f, "{id}/{node_kind}/{operation}/{node_id}") - } -} - -impl FromStr for SubscriptionFullKey { - type Err = String; - - fn from_str(subscription_kind_str: &str) -> Result { - let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) { - Some(captures) => captures, - None => { - return Err(format!( - "Subscription kind str does not match a subscription full key regex {}", - SUBSCRIPTION_FULL_KEY_REGEX.as_str() - )); - } - }; - - Ok(Self { - id: TenantTimelineId::new( - parse_capture(&key_captures, 1)?, - parse_capture(&key_captures, 2)?, - ), - node_kind: parse_capture(&key_captures, 3)?, - operation: parse_capture(&key_captures, 4)?, - node_id: NodeId(parse_capture(&key_captures, 5)?), - }) - } -} - -fn parse_capture(caps: &Captures, index: usize) -> Result -where - T: FromStr, - ::Err: Display, -{ - let capture_match = caps - .get(index) - .ok_or_else(|| format!("Failed to get capture match at index {index}"))? 
- .as_str(); - capture_match.parse().map_err(|e| { - format!( - "Failed to parse {} from {capture_match}: {e}", - std::any::type_name::() - ) - }) -} - -impl Display for NodeKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Safekeeper => write!(f, "safekeeper"), - Self::Pageserver => write!(f, "pageserver"), - } - } -} - -impl FromStr for NodeKind { - type Err = String; - - fn from_str(node_kind_str: &str) -> Result { - match node_kind_str { - "safekeeper" => Ok(Self::Safekeeper), - "pageserver" => Ok(Self::Pageserver), - _ => Err(format!("Invalid node kind: {node_kind_str}")), - } - } -} - -impl Display for SkOperationKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::TimelineInfo => write!(f, "timeline_info"), - Self::WalBackup => write!(f, "wal_backup"), - } - } -} - -impl FromStr for SkOperationKind { - type Err = String; - - fn from_str(operation_str: &str) -> Result { - match operation_str { - "timeline_info" => Ok(Self::TimelineInfo), - "wal_backup" => Ok(Self::WalBackup), - _ => Err(format!("Invalid operation: {operation_str}")), - } - } -} - -#[cfg(test)] -mod tests { - use utils::id::TimelineId; - - use super::*; - - #[test] - fn full_cluster_key_parsing() { - let prefix = "neon"; - let node_kind = NodeKind::Safekeeper; - let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup); - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - let id = TenantTimelineId::new(tenant_id, timeline_id); - let node_id = NodeId(1); - - let timeline_subscription_keys = [ - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::All, - }, - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::TenantTimelines(tenant_id), - }, - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::Timeline(id), - }, - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::Node(id, node_kind), - }, - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::Operation(id, node_kind, operation_kind), - }, - ]; - - let full_key_string = format!( - "{}/{node_id}", - timeline_subscription_keys.last().unwrap().watch_key() - ); - - for key in timeline_subscription_keys { - assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match"); - } - - let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| { - panic!("Failed to parse {full_key_string} as a subscription full key: {e}") - }); - - assert_eq!( - full_key, - SubscriptionFullKey { - id, - node_kind, - operation: operation_kind, - node_id - } - ) - } -} diff --git a/libs/etcd_broker/src/subscription_value.rs b/libs/etcd_broker/src/subscription_value.rs deleted file mode 100644 index 60a5411926..0000000000 --- a/libs/etcd_broker/src/subscription_value.rs +++ /dev/null @@ -1,38 +0,0 @@ -//! Module for the values to put into etcd. - -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; -use utils::lsn::Lsn; - -/// Data about safekeeper's timeline. Fields made optional for easy migrations. -#[serde_as] -#[derive(Debug, Clone, Deserialize, Serialize)] -pub struct SkTimelineInfo { - /// Term of the last entry. - pub last_log_term: Option, - /// LSN of the last record. 
- #[serde_as(as = "Option")] - #[serde(default)] - pub flush_lsn: Option, - /// Up to which LSN safekeeper regards its WAL as committed. - #[serde_as(as = "Option")] - #[serde(default)] - pub commit_lsn: Option, - /// LSN up to which safekeeper has backed WAL. - #[serde_as(as = "Option")] - #[serde(default)] - pub backup_lsn: Option, - /// LSN of last checkpoint uploaded by pageserver. - #[serde_as(as = "Option")] - #[serde(default)] - pub remote_consistent_lsn: Option, - #[serde_as(as = "Option")] - #[serde(default)] - pub peer_horizon_lsn: Option, - #[serde_as(as = "Option")] - #[serde(default)] - pub local_start_lsn: Option, - /// A connection string to use for WAL receiving. - #[serde(default)] - pub safekeeper_connstr: Option, -} diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 85c6439367..0c1310eef9 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -22,3 +22,40 @@ pub struct TimelineCreateRequest { // If not passed, it is assigned to the beginning of commit_lsn segment. pub local_start_lsn: Option, } + +fn lsn_invalid() -> Lsn { + Lsn::INVALID +} + +/// Data about safekeeper's timeline, mirrors broker.proto. +#[serde_as] +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SkTimelineInfo { + /// Term of the last entry. + pub last_log_term: Option, + /// LSN of the last record. + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub commit_lsn: Lsn, + /// LSN up to which safekeeper has backed WAL. + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub backup_lsn: Lsn, + /// LSN of last checkpoint uploaded by pageserver. + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub remote_consistent_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub peer_horizon_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub local_start_lsn: Lsn, + /// A connection string to use for WAL receiving. 
+ #[serde(default)] + pub safekeeper_connstr: Option, +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 43d51f90c1..54bbe4714d 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -59,13 +59,13 @@ tracing = "0.1.36" url = "2" walkdir = "2.3.2" -etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } pageserver_api = { path = "../libs/pageserver_api" } postgres_connection = { path = "../libs/postgres_connection" } postgres_ffi = { path = "../libs/postgres_ffi" } pq_proto = { path = "../libs/pq_proto" } remote_storage = { path = "../libs/remote_storage" } +storage_broker = { version = "0.1", path = "../storage_broker" } tenant_size_model = { path = "../libs/tenant_size_model" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d70b36616b..345f391e61 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -247,7 +247,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); - WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?; + WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; // initialize authentication for incoming connections let auth = match &conf.auth_type { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 86f1fcef94..c07907a1c9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,6 +7,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; +use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -18,7 +19,7 @@ use std::sync::Arc; use std::time::Duration; use toml_edit; use toml_edit::{Document, Item}; -use url::Url; + use utils::{ id::{NodeId, TenantId, TimelineId}, logging::LogFormat, @@ -39,6 +40,7 @@ pub mod defaults { DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; + pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; @@ -59,7 +61,6 @@ pub mod defaults { pub const DEFAULT_CONFIG_FILE: &str = formatcp!( r###" # Initial configuration file created by 'pageserver --init' - #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' @@ -71,6 +72,8 @@ pub mod defaults { # initial superuser role name to use when creating a new tenant #initial_superuser_name = '{DEFAULT_SUPERUSER}' +#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}' + #log_format = '{DEFAULT_LOG_FORMAT}' #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' @@ -132,12 +135,8 @@ pub struct PageServerConf { pub profiling: ProfilingConfig, pub default_tenant_conf: TenantConf, - /// A prefix to add in etcd brokers before every key. - /// Can be used for isolating different pageserver groups within the same etcd cluster. - pub broker_etcd_prefix: String, - - /// Etcd broker endpoints to connect to. - pub broker_endpoints: Vec, + /// Storage broker endpoints to connect to. 
+ pub broker_endpoint: Uri, pub log_format: LogFormat, @@ -148,8 +147,7 @@ pub struct PageServerConf { /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in -/// the future, more tokens and auth may arrive for etcd and/or its rewrite (see -/// https://github.com/neondatabase/neon/issues/2394), completely changing the logic. +/// the future, more tokens and auth may arrive for storage broker, completely changing the logic. /// Hence, we resort to a global variable for now instead of passing the token from the /// startup code to the connection code through a dozen layers. pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); @@ -216,8 +214,7 @@ struct PageServerConfigBuilder { id: BuilderValue, profiling: BuilderValue, - broker_etcd_prefix: BuilderValue, - broker_endpoints: BuilderValue>, + broker_endpoint: BuilderValue, log_format: BuilderValue, @@ -247,8 +244,9 @@ impl Default for PageServerConfigBuilder { remote_storage_config: Set(None), id: NotSet, profiling: Set(ProfilingConfig::Disabled), - broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()), - broker_endpoints: Set(Vec::new()), + broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT + .parse() + .expect("failed to parse default broker endpoint")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), @@ -308,12 +306,8 @@ impl PageServerConfigBuilder { self.remote_storage_config = BuilderValue::Set(remote_storage_config) } - pub fn broker_endpoints(&mut self, broker_endpoints: Vec) { - self.broker_endpoints = BuilderValue::Set(broker_endpoints) - } - - pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) { - self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix) + pub fn broker_endpoint(&mut self, broker_endpoint: Uri) { + self.broker_endpoint = BuilderValue::Set(broker_endpoint) } pub fn id(&mut self, node_id: NodeId) { @@ -368,12 +362,9 @@ impl PageServerConfigBuilder { profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, // TenantConf is handled separately default_tenant_conf: TenantConf::default(), - broker_endpoints: self - .broker_endpoints + broker_endpoint: self + .broker_endpoint .ok_or(anyhow!("No broker endpoints provided"))?, - broker_etcd_prefix: self - .broker_etcd_prefix - .ok_or(anyhow!("missing broker_etcd_prefix"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, concurrent_tenant_size_logical_size_queries: self .concurrent_tenant_size_logical_size_queries @@ -540,17 +531,7 @@ impl PageServerConf { } "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), - "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), - "broker_endpoints" => builder.broker_endpoints( - parse_toml_array(key, item)? - .into_iter() - .map(|endpoint_str| { - endpoint_str.parse::().with_context(|| { - format!("Array item {endpoint_str} for key {key} is not a valid url endpoint") - }) - }) - .collect::>()?, - ), + "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), "log_format" => builder.log_format( LogFormat::from_config(&parse_toml_string(key, item)?)? 
), @@ -677,8 +658,7 @@ impl PageServerConf { remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::dummy_conf(), - broker_endpoints: Vec::new(), - broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), } @@ -730,22 +710,6 @@ where }) } -fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { - let array = item - .as_array() - .with_context(|| format!("configure option {name} is not an array"))?; - - array - .iter() - .map(|value| { - value - .as_str() - .map(str::to_string) - .with_context(|| format!("Array item {value:?} for key {name} is not a string")) - }) - .collect() -} - /// Configurable semaphore permits setting. /// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty @@ -835,10 +799,10 @@ log_format = 'json' fn parse_defaults() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; + let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; // we have to create dummy values to overcome the validation errors let config_string = format!( - "pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']", + "pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'", pg_distrib_dir.display() ); let toml = config_string.parse()?; @@ -864,10 +828,7 @@ log_format = 'json' remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), - broker_endpoints: vec![broker_endpoint - .parse() - .expect("Failed to parse a valid broker endpoint URL")], - broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), }, @@ -881,10 +842,10 @@ log_format = 'json' fn parse_basic_config() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; + let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']", + "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'", pg_distrib_dir.display() ); let toml = config_string.parse()?; @@ -910,10 +871,7 @@ log_format = 'json' remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), - broker_endpoints: vec![broker_endpoint - .parse() - .expect("Failed to parse a valid broker endpoint URL")], - broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), }, @@ -947,7 +905,7 @@ local_path = '{}'"#, let config_string = format!( r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' -broker_endpoints = ['{broker_endpoint}'] +broker_endpoint = '{broker_endpoint}' {remote_storage_config_str}"#, pg_distrib_dir.display(), @@ -1014,7 +972,7 @@ concurrency_limit = 
{s3_concurrency_limit}"# let config_string = format!( r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' -broker_endpoints = ['{broker_endpoint}'] +broker_endpoint = '{broker_endpoint}' {remote_storage_config_str}"#, pg_distrib_dir.display(), @@ -1059,7 +1017,7 @@ broker_endpoints = ['{broker_endpoint}'] let config_string = format!( r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' -broker_endpoints = ['{broker_endpoint}'] +broker_endpoint = '{broker_endpoint}' [tenant_config] trace_read_requests = {trace_read_requests}"#, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 3462f4eb82..3325ce01d4 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -71,7 +71,7 @@ use crate::shutdown_pageserver; // // WAL receiver runtime: // - used to handle WAL receiver connections. -// - and to receiver updates from etcd +// - and to receiver updates from storage_broker // // Background runtime // - layer flushing @@ -178,7 +178,7 @@ pub enum TaskKind { PageRequestHandler, // Manages the WAL receiver connection for one timeline. It subscribes to - // events from etcd, decides which safekeeper to connect to. It spawns a + // events from storage_broker, decides which safekeeper to connect to. It spawns a // separate WalReceiverConnection task to handle each connection. WalReceiverManager, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index aab5d6f1d3..ed60530f83 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -54,7 +54,7 @@ use utils::{ use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; -use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; +use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::METADATA_FILE_NAME; @@ -856,12 +856,12 @@ impl Timeline { } pub(super) fn launch_wal_receiver(self: &Arc) { - if !is_etcd_client_initialized() { + if !is_broker_client_initialized() { if cfg!(test) { - info!("not launching WAL receiver because etcd client hasn't been initialized"); + info!("not launching WAL receiver because broker client hasn't been initialized"); return; } else { - panic!("etcd client not initialized"); + panic!("broker client not initialized"); } } @@ -882,7 +882,6 @@ impl Timeline { drop(tenant_conf_guard); let self_clone = Arc::clone(self); spawn_connection_manager_task( - self.conf.broker_etcd_prefix.clone(), self_clone, walreceiver_connect_timeout, lagging_wal_timeout, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 1fad91c836..e627e9ecd0 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -6,7 +6,7 @@ //! hence WAL receiver needs to react on such events. //! //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. -//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically. +//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. //! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. //! Without this data, no WAL streaming is possible currently. //! 
@@ -26,57 +26,49 @@ mod walreceiver_connection;
 use crate::config::PageServerConf;
 use crate::task_mgr::WALRECEIVER_RUNTIME;
-use anyhow::{ensure, Context};
-use etcd_broker::Client;
-use itertools::Itertools;
+use anyhow::Context;
 use once_cell::sync::OnceCell;
 use std::future::Future;
+use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
 use tracing::*;
-use url::Url;

 pub use connection_manager::spawn_connection_manager_task;

-static ETCD_CLIENT: OnceCell<Client> = OnceCell::new();
+static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();

 ///
-/// Initialize the etcd client. This must be called once at page server startup.
+/// Initialize the broker client. This must be called once at page server startup.
 ///
-pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
-    let etcd_endpoints = conf.broker_endpoints.clone();
-    ensure!(
-        !etcd_endpoints.is_empty(),
-        "Cannot start wal receiver: etcd endpoints are empty"
-    );
+pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
+    let broker_endpoint = conf.broker_endpoint.clone();

-    let etcd_client = Client::connect(etcd_endpoints.clone(), None)
-        .await
-        .context("Failed to connect to etcd")?;
+    // Note: we do not attempt connecting here (but validate endpoints sanity).
+    let broker_client = storage_broker::connect(broker_endpoint.clone()).context(format!(
+        "Failed to create broker client to {}",
+        &conf.broker_endpoint
+    ))?;

-    // FIXME: Should we still allow the pageserver to start, if etcd
-    // doesn't work? It could still serve GetPage requests, with the
-    // data it has locally and from what it can download from remote
-    // storage
-    if ETCD_CLIENT.set(etcd_client).is_err() {
-        panic!("etcd already initialized");
+    if BROKER_CLIENT.set(broker_client).is_err() {
+        panic!("broker already initialized");
     }

     info!(
-        "Initialized etcd client with endpoints: {}",
-        etcd_endpoints.iter().map(Url::to_string).join(", ")
+        "Initialized broker client with endpoints: {}",
+        broker_endpoint
     );
     Ok(())
 }

 ///
-/// Get a handle to the etcd client
+/// Get a handle to the broker client
 ///
-pub fn get_etcd_client() -> &'static etcd_broker::Client {
-    ETCD_CLIENT.get().expect("etcd client not initialized")
+pub fn get_broker_client() -> &'static BrokerClientChannel {
+    BROKER_CLIENT.get().expect("broker client not initialized")
 }

-pub fn is_etcd_client_initialized() -> bool {
-    ETCD_CLIENT.get().is_some()
+pub fn is_broker_client_initialized() -> bool {
+    BROKER_CLIENT.get().is_some()
 }

 /// A handle of an asynchronous task.
diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs
index c598f20b10..8048707480 100644
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -1,21 +1,15 @@
 //! WAL receiver logic that ensures the pageserver gets connectected to safekeeper,
 //! that contains the latest WAL to stream and this connection does not go stale.
 //!
-//! To achieve that, a etcd broker is used: safekepers propagate their timelines' state in it,
+//! To achieve that, a storage broker is used: safekeepers propagate their timelines' state in it,
 //! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection.
 //! Current connection state is tracked too, to ensure it's not getting stale.
 //!
-//! After every connection or etcd update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
+//! 
After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader, //! then a [re]connection happens, if necessary. -//! Only WAL streaming task expects to be finished, other loops (etcd, connection management) never exit unless cancelled explicitly via the dedicated channel. +//! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel. -use std::{ - collections::{hash_map, HashMap}, - num::NonZeroU64, - ops::ControlFlow, - sync::Arc, - time::Duration, -}; +use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; @@ -23,16 +17,18 @@ use crate::tenant::Timeline; use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; -use etcd_broker::{ - subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, - BrokerUpdate, Client, -}; use pageserver_api::models::TimelineState; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::BrokerClientChannel; +use storage_broker::Streaming; use tokio::{select, sync::watch}; use tracing::*; use crate::{ - exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS, + exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; use postgres_connection::{parse_host_port, PgConnectionConfig}; @@ -45,14 +41,13 @@ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; /// Spawns the loop to take care of the timeline's WAL streaming connection. pub fn spawn_connection_manager_task( - broker_loop_prefix: String, timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, ) { - let mut etcd_client = get_etcd_client().clone(); + let mut broker_client = get_broker_client().clone(); let tenant_id = timeline.tenant_id; let timeline_id = timeline.timeline_id; @@ -65,7 +60,7 @@ pub fn spawn_connection_manager_task( &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), false, async move { - info!("WAL receiver broker started, connecting to etcd"); + info!("WAL receiver manager started, connecting to broker"); let mut walreceiver_state = WalreceiverState::new( timeline, wal_connect_timeout, @@ -81,8 +76,7 @@ pub fn spawn_connection_manager_task( return Ok(()); }, loop_step_result = connection_manager_loop_step( - &broker_loop_prefix, - &mut etcd_client, + &mut broker_client, &mut walreceiver_state, ) => match loop_step_result { ControlFlow::Continue(()) => continue, @@ -103,10 +97,9 @@ pub fn spawn_connection_manager_task( /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task. -/// If etcd subscription is cancelled, exits. +/// If storage broker subscription is cancelled, exits. 
async fn connection_manager_loop_step( - broker_prefix: &str, - etcd_client: &mut Client, + broker_client: &mut BrokerClientChannel, walreceiver_state: &mut WalreceiverState, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); @@ -124,13 +117,11 @@ async fn connection_manager_loop_step( timeline_id: walreceiver_state.timeline.timeline_id, }; - // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, - // running the entire loop step as much as possible to an end. - // The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end, - // forcing the etcd subscription to exit either way. - let mut broker_subscription = - subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await; - info!("Subscribed for etcd timeline changes, waiting for new etcd data"); + // Subscribe to the broker updates. Stream shares underlying TCP connection + // with other streams on this client (other connection managers). When + // object goes out of scope, stream finishes in drop() automatically. + let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; + info!("Subscribed for broker timeline updates"); loop { let time_until_next_retry = walreceiver_state.time_until_next_retry(); @@ -145,12 +136,6 @@ async fn connection_manager_loop_step( // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently select! { - broker_connection_result = &mut broker_subscription.watcher_handle => { - info!("Broker connection was closed from the other side, ending current broker loop step"); - cleanup_broker_connection(broker_connection_result, walreceiver_state); - return ControlFlow::Continue(()); - }, - Some(wal_connection_update) = async { match walreceiver_state.wal_connection.as_mut() { Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), @@ -185,22 +170,16 @@ async fn connection_manager_loop_step( } }, - // Got a new update from etcd - broker_update = broker_subscription.value_updates.recv() => { + // Got a new update from the broker + broker_update = broker_subscription.message() => { match broker_update { - Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), - None => { - info!("Broker sender end was dropped, ending current broker loop step"); - // Ensure to cancel and wait for the broker subscription task end, to log its result. - // Broker sender end is in the broker subscription task and its drop means abnormal task completion. - // First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times). - broker_subscription.watcher_handle.abort(); - // Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case), - // a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway. 
- cleanup_broker_connection( - (&mut broker_subscription.watcher_handle).await, - walreceiver_state, - ); + Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update), + Err(e) => { + error!("broker subscription failed: {e}"); + return ControlFlow::Continue(()); + } + Ok(None) => { + error!("broker subscription stream ended"); // can't happen return ControlFlow::Continue(()); } } @@ -234,17 +213,6 @@ async fn connection_manager_loop_step( _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} } - // Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly. - let mut max_events_to_poll = 100_u32; - while max_events_to_poll > 0 { - if let Ok(broker_update) = broker_subscription.value_updates.try_recv() { - walreceiver_state.register_timeline_update(broker_update); - max_events_to_poll -= 1; - } else { - break; - } - } - if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { info!("Switching to new connection candidate: {new_candidate:?}"); walreceiver_state @@ -285,33 +253,11 @@ async fn wait_for_active_timeline( } } -fn cleanup_broker_connection( - broker_connection_result: Result, tokio::task::JoinError>, - walreceiver_state: &mut WalreceiverState, -) { - match broker_connection_result { - Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"), - Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"), - Err(abort_error) => { - if abort_error.is_panic() { - error!("Broker connection panicked: {abort_error}") - } else { - debug!("Broker connection aborted: {abort_error}") - } - } - } - - walreceiver_state.wal_stream_candidates.clear(); -} - /// Endlessly try to subscribe for broker updates for a given timeline. -/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly. -/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. async fn subscribe_for_timeline_updates( - etcd_client: &mut Client, - broker_prefix: &str, + broker_client: &mut BrokerClientChannel, id: TenantTimelineId, -) -> BrokerSubscription { +) -> Streaming { let mut attempt = 0; loop { exponential_backoff( @@ -322,18 +268,21 @@ async fn subscribe_for_timeline_updates( .await; attempt += 1; - match etcd_broker::subscribe_for_json_values( - etcd_client, - SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id), - ) - .instrument(info_span!("etcd_subscription")) - .await - { - Ok(new_subscription) => { - return new_subscription; + // subscribe to the specific timeline + let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SubscribeSafekeeperInfoRequest { + subscription_key: Some(key), + }; + + match broker_client.subscribe_safekeeper_info(request).await { + Ok(resp) => { + return resp.into_inner(); } Err(e) => { - warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}"); + warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); continue; } } @@ -360,8 +309,8 @@ struct WalreceiverState { wal_connection: Option, /// Info about retries and unsuccessful attempts to connect to safekeepers. 
wal_connection_retries: HashMap, - /// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id. - wal_stream_candidates: HashMap, + /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id. + wal_stream_candidates: HashMap, auth_token: Option>, } @@ -395,13 +344,11 @@ struct RetryInfo { retry_duration_seconds: f64, } -/// Data about the timeline to connect to, received from etcd. +/// Data about the timeline to connect to, received from the broker. #[derive(Debug)] -struct EtcdSkTimeline { - timeline: SkTimelineInfo, - /// Etcd generation, the bigger it is, the more up to date the timeline data is. - etcd_version: i64, - /// Time at which the data was fetched from etcd last time, to track the stale data. +struct BrokerSkTimeline { + timeline: SafekeeperTimelineInfo, + /// Time at which the data was fetched from the broker last time, to track the stale data. latest_update: NaiveDateTime, } @@ -538,31 +485,18 @@ impl WalreceiverState { next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) } - /// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key. - fn register_timeline_update(&mut self, timeline_update: BrokerUpdate) { - match self - .wal_stream_candidates - .entry(timeline_update.key.node_id) - { - hash_map::Entry::Occupied(mut o) => { - let existing_value = o.get_mut(); - if existing_value.etcd_version < timeline_update.etcd_version { - existing_value.etcd_version = timeline_update.etcd_version; - existing_value.timeline = timeline_update.value; - existing_value.latest_update = Utc::now().naive_utc(); - } - } - hash_map::Entry::Vacant(v) => { - v.insert(EtcdSkTimeline { - timeline: timeline_update.value, - etcd_version: timeline_update.etcd_version, - latest_update: Utc::now().naive_utc(), - }); - } - } + /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. + fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + self.wal_stream_candidates.insert( + NodeId(timeline_update.safekeeper_id), + BrokerSkTimeline { + timeline: timeline_update, + latest_update: Utc::now().naive_utc(), + }, + ); } - /// Cleans up stale etcd records and checks the rest for the new connection candidate. + /// Cleans up stale broker records and checks the rest for the new connection candidate. /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise. /// The current rules for approving new candidates: /// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps @@ -585,7 +519,7 @@ impl WalreceiverState { Some(existing_wal_connection) => { let connected_sk_node = existing_wal_connection.sk_id; - let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connconf) = + let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(Some(connected_sk_node))?; let now = Utc::now().naive_utc(); @@ -614,7 +548,7 @@ impl WalreceiverState { } if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn { - let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + let new_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn); // Check if the new candidate has much more WAL than the current one. 
match new_commit_lsn.0.checked_sub(current_commit_lsn.0) { Some(new_sk_lsn_advantage) => { @@ -644,7 +578,7 @@ impl WalreceiverState { .status .commit_lsn .unwrap_or(current_lsn); - let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + let candidate_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn); // Keep discovered_new_wal only if connected safekeeper has not caught up yet. let mut discovered_new_wal = existing_wal_connection @@ -727,7 +661,7 @@ impl WalreceiverState { None } - /// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers. + /// Selects the best possible candidate, based on the data collected from the broker updates about the safekeepers. /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. /// /// The candidate that is chosen: @@ -736,7 +670,7 @@ impl WalreceiverState { fn select_connection_candidate( &self, node_to_omit: Option, - ) -> Option<(NodeId, &SkTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -746,12 +680,12 @@ impl WalreceiverState { /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, - ) -> impl Iterator { + ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates .iter() - .filter(|(_, info)| info.timeline.commit_lsn.is_some()) + .filter(|(_, info)| Lsn(info.timeline.commit_lsn) != Lsn::INVALID) .filter(move |(sk_id, _)| { let next_retry_at = self .wal_connection_retries @@ -761,12 +695,14 @@ impl WalreceiverState { }); next_retry_at.is_none() || next_retry_at.unwrap() <= now - }) - .filter_map(|(sk_id, etcd_info)| { - let info = &etcd_info.timeline; + }).filter_map(|(sk_id, broker_info)| { + let info = &broker_info.timeline; + if info.safekeeper_connstr.is_empty() { + return None; // no connection string, ignore sk + } match wal_stream_connection_config( self.id, - info.safekeeper_connstr.as_deref()?, + info.safekeeper_connstr.as_ref(), match &self.auth_token { None => None, Some(x) => Some(x), @@ -781,15 +717,15 @@ impl WalreceiverState { }) } - /// Remove candidates which haven't sent etcd updates for a while. + /// Remove candidates which haven't sent broker updates for a while. 
fn cleanup_old_candidates(&mut self) { let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); - self.wal_stream_candidates.retain(|node_id, etcd_info| { - if let Ok(time_since_latest_etcd_update) = - (Utc::now().naive_utc() - etcd_info.latest_update).to_std() + self.wal_stream_candidates.retain(|node_id, broker_info| { + if let Ok(time_since_latest_broker_update) = + (Utc::now().naive_utc() - broker_info.latest_update).to_std() { - let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout; + let should_retain = time_since_latest_broker_update < self.lagging_wal_timeout; if !should_retain { node_ids_to_remove.push(*node_id); } @@ -870,6 +806,28 @@ mod tests { use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use url::Host; + fn dummy_broker_sk_timeline( + commit_lsn: u64, + safekeeper_connstr: &str, + latest_update: NaiveDateTime, + ) -> BrokerSkTimeline { + BrokerSkTimeline { + timeline: SafekeeperTimelineInfo { + safekeeper_id: 0, + tenant_timeline_id: None, + last_log_term: 0, + flush_lsn: 0, + commit_lsn, + backup_lsn: 0, + remote_consistent_lsn: 0, + peer_horizon_lsn: 0, + local_start_lsn: 0, + safekeeper_connstr: safekeeper_connstr.to_owned(), + }, + latest_update, + } + } + #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { let harness = TenantHarness::create("no_connection_no_candidate")?; @@ -881,74 +839,16 @@ mod tests { state.wal_connection = None; state.wal_stream_candidates = HashMap::from([ - ( - NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - safekeeper_connstr: None, - }, - etcd_version: 0, - latest_update: now, - }, - ), - ( - NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: None, - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("no_commit_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, - ), - ( - NodeId(2), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: None, - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - safekeeper_connstr: Some("no_commit_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, - ), + (NodeId(0), dummy_broker_sk_timeline(1, "", now)), + (NodeId(1), dummy_broker_sk_timeline(0, "no_commit_lsn", now)), + (NodeId(2), dummy_broker_sk_timeline(0, "no_commit_lsn", now)), ( NodeId(3), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - safekeeper_connstr: None, - }, - etcd_version: 0, - latest_update: delay_over_threshold, - }, + dummy_broker_sk_timeline( + 1 + state.max_lsn_wal_lag.get(), + "delay_over_threshold", + delay_over_threshold, + ), ), ]); @@ -995,57 +895,23 @@ mod tests { state.wal_stream_candidates = HashMap::from([ ( connected_sk_id, - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - 
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline( + current_lsn + state.max_lsn_wal_lag.get() * 2, + DUMMY_SAFEKEEPER_HOST, + now, + ), ), ( NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(current_lsn)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("not_advanced_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(current_lsn, "not_advanced_lsn", now), ), ( NodeId(2), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline( + current_lsn + state.max_lsn_wal_lag.get() / 2, + "not_enough_advanced_lsn", + now, + ), ), ]); @@ -1067,21 +933,7 @@ mod tests { state.wal_connection = None; state.wal_stream_candidates = HashMap::from([( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now), )]); let only_candidate = state @@ -1102,57 +954,15 @@ mod tests { state.wal_stream_candidates = HashMap::from([ ( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn - 100)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("smaller_commit_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(selected_lsn - 100, "smaller_commit_lsn", now), ), ( NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(selected_lsn, DUMMY_SAFEKEEPER_HOST, now), ), ( NodeId(2), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn + 100)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: None, - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(selected_lsn + 100, "", now), ), ]); let biggest_wal_candidate = state.next_connection_candidate().expect( @@ -1186,39 +996,11 @@ mod tests { state.wal_stream_candidates = HashMap::from([ ( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(bigger_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: 
Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(bigger_lsn.0, DUMMY_SAFEKEEPER_HOST, now), ), ( NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), ), ]); state.wal_connection_retries = HashMap::from([( @@ -1275,39 +1057,11 @@ mod tests { state.wal_stream_candidates = HashMap::from([ ( connected_sk_id, - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), ), ( NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(new_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(new_lsn.0, "advanced_by_lsn_safekeeper", now), ), ]); @@ -1367,21 +1121,7 @@ mod tests { }); state.wal_stream_candidates = HashMap::from([( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), )]); let over_threshcurrent_candidate = state.next_connection_candidate().expect( @@ -1441,21 +1181,7 @@ mod tests { }); state.wal_stream_candidates = HashMap::from([( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(new_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(new_lsn.0, DUMMY_SAFEKEEPER_HOST, now), )]); let over_threshcurrent_candidate = state.next_connection_candidate().expect( diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 658bdfe42c..daee368b12 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +async-stream = "0.3" anyhow = "1.0" async-trait = "0.1" byteorder = "1.4.3" @@ -33,12 +34,12 @@ toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.27" url = "2.2.2" -etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } postgres_ffi = { path = "../libs/postgres_ffi" } pq_proto = { path = "../libs/pq_proto" } remote_storage = { path = "../libs/remote_storage" } safekeeper_api = { path = "../libs/safekeeper_api" } +storage_broker = { version = "0.1", path = "../storage_broker" } utils = { path = 
"../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 45f0f2f5b2..d6ce5f8ac4 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -13,7 +13,6 @@ use std::thread; use tokio::sync::mpsc; use toml_edit::Document; use tracing::*; -use url::{ParseError, Url}; use utils::pid_file; use metrics::set_build_info_metric; @@ -29,6 +28,7 @@ use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; +use storage_broker::DEFAULT_ENDPOINT; use utils::auth::JwtAuth; use utils::{ http::endpoint, @@ -82,12 +82,8 @@ fn main() -> anyhow::Result<()> { )); } - if let Some(addr) = arg_matches.get_one::("broker-endpoints") { - let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); - conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?; - } - if let Some(prefix) = arg_matches.get_one::("broker-etcd-prefix") { - conf.broker_etcd_prefix = prefix.to_string(); + if let Some(addr) = arg_matches.get_one::("broker-endpoint") { + conf.broker_endpoint = addr.parse().context("failed to parse broker endpoint")?; } if let Some(heartbeat_timeout_str) = arg_matches.get_one::("heartbeat-timeout") { @@ -224,19 +220,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo threads.push(safekeeper_thread); - if !conf.broker_endpoints.is_empty() { - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("broker thread".into()) - .spawn(|| { - // TODO: add auth? - broker::thread_main(conf_); - })?, - ); - } else { - warn!("No broker endpoints providing, starting without node sync") - } + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("broker thread".into()) + .spawn(|| { + // TODO: add auth? + broker::thread_main(conf_); + })?, + ); let conf_ = conf.clone(); threads.push( @@ -369,14 +361,9 @@ fn cli() -> Command { .arg( Arg::new("id").long("id").help("safekeeper node id: integer") ).arg( - Arg::new("broker-endpoints") - .long("broker-endpoints") - .help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"), - ) - .arg( - Arg::new("broker-etcd-prefix") - .long("broker-etcd-prefix") - .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), + Arg::new("broker-endpoint") + .long("broker-endpoint") + .help(formatcp!("Broker endpoint for storage nodes coordination in the form http[s]://host:port, default '{DEFAULT_ENDPOINT}'. In case of https schema TLS is connection is established; plaintext otherwise.")), ) .arg( Arg::new("heartbeat-timeout") diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 76135241b9..df2dc92efe 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,15 +1,18 @@ -//! Communication with etcd, providing safekeeper peers and pageserver coordination. +//! Communication with the broker, providing safekeeper peers and pageserver coordination. 
+use anyhow::anyhow; +use anyhow::bail; use anyhow::Context; + use anyhow::Error; use anyhow::Result; -use etcd_broker::subscription_value::SkTimelineInfo; -use etcd_broker::LeaseKeepAliveStream; -use etcd_broker::LeaseKeeper; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::HashSet; +use storage_broker::parse_proto_ttid; +use storage_broker::proto::broker_service_client::BrokerServiceClient; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::Request; + use std::time::Duration; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; @@ -17,15 +20,9 @@ use tracing::*; use crate::GlobalTimelines; use crate::SafeKeeperConf; -use etcd_broker::{ - subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, - Client, PutOptions, -}; -use utils::id::{NodeId, TenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; -const LEASE_TTL_SEC: i64 = 10; pub fn thread_main(conf: SafeKeeperConf) { let runtime = runtime::Builder::new_current_thread() @@ -34,158 +31,70 @@ pub fn thread_main(conf: SafeKeeperConf) { .unwrap(); let _enter = info_span!("broker").entered(); - info!("started, broker endpoints {:?}", conf.broker_endpoints); + info!("started, broker endpoint {:?}", conf.broker_endpoint); runtime.block_on(async { main_loop(conf).await; }); } -/// Key to per timeline per safekeeper data. -fn timeline_safekeeper_path( - broker_etcd_prefix: String, - ttid: TenantTimelineId, - sk_id: NodeId, -) -> String { - format!( - "{}/{sk_id}", - SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key() - ) -} - -async fn push_sk_info( - ttid: TenantTimelineId, - mut client: Client, - key: String, - sk_info: SkTimelineInfo, - mut lease: Lease, -) -> anyhow::Result<(TenantTimelineId, Lease)> { - let put_opts = PutOptions::new().with_lease(lease.id); - client - .put( - key.clone(), - serde_json::to_string(&sk_info)?, - Some(put_opts), - ) - .await - .with_context(|| format!("failed to push safekeeper info to {}", key))?; - - // revive the lease - lease - .keeper - .keep_alive() - .await - .context("failed to send LeaseKeepAliveRequest")?; - lease - .ka_stream - .message() - .await - .context("failed to receive LeaseKeepAliveResponse")?; - - Ok((ttid, lease)) -} - -struct Lease { - id: i64, - keeper: LeaseKeeper, - ka_stream: LeaseKeepAliveStream, -} - /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { - let mut client = Client::connect(&conf.broker_endpoints, None).await?; - let mut leases: HashMap = HashMap::new(); - + let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); - loop { - // Note: we lock runtime here and in timeline methods as GlobalTimelines - // is under plain mutex. That's ok, all this code is not performance - // sensitive and there is no risk of deadlock as we don't await while - // lock is held. - let mut active_tlis = GlobalTimelines::get_all(); - active_tlis.retain(|tli| tli.is_active()); - let active_tlis_set: HashSet = - active_tlis.iter().map(|tli| tli.ttid).collect(); - - // // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. 
- for tli in &active_tlis { - if let Entry::Vacant(v) = leases.entry(tli.ttid) { - let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; - let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; - v.insert(Lease { - id: lease.id(), - keeper, - ka_stream, - }); - } - } - leases.retain(|ttid, _| active_tlis_set.contains(ttid)); - - // Push data concurrently to not suffer from latency, with many timelines it can be slow. - let handles = active_tlis - .iter() - .map(|tli| { + let outbound = async_stream::stream! { + loop { + // Note: we lock runtime here and in timeline methods as GlobalTimelines + // is under plain mutex. That's ok, all this code is not performance + // sensitive and there is no risk of deadlock as we don't await while + // lock is held. + let mut active_tlis = GlobalTimelines::get_all(); + active_tlis.retain(|tli| tli.is_active()); + for tli in &active_tlis { let sk_info = tli.get_safekeeper_info(&conf); - let key = - timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id); - let lease = leases.remove(&tli.ttid).unwrap(); - tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease)) - }) - .collect::>(); - for h in handles { - let (ttid, lease) = h.await??; - // It is ugly to pull leases from hash and then put it back, but - // otherwise we have to resort to long living per tli tasks (which - // would generate a lot of errors when etcd is down) as task wants to - // have 'static objects, we can't borrow to it. - leases.insert(ttid, lease); + yield sk_info; + } + sleep(push_interval).await; } - - sleep(push_interval).await; - } + }; + client + .publish_safekeeper_info(Request::new(outbound)) + .await?; + Ok(()) } /// Subscribe and fetch all the interesting data from the broker. async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { - let mut client = Client::connect(&conf.broker_endpoints, None).await?; + let mut client = storage_broker::connect(conf.broker_endpoint)?; - let mut subscription = etcd_broker::subscribe_for_values( - &mut client, - SubscriptionKey::all(conf.broker_etcd_prefix.clone()), - |full_key, value_str| { - if full_key.operation == OperationKind::Safekeeper(SkOperationKind::TimelineInfo) { - match serde_json::from_str::(value_str) { - Ok(new_info) => return Some(new_info), - Err(e) => { - error!("Failed to parse timeline info from value str '{value_str}': {e}") - } - } - } - None - }, - ) - .await - .context("failed to subscribe for safekeeper info")?; - loop { - match subscription.value_updates.recv().await { - Some(new_info) => { - // note: there are blocking operations below, but it's considered fine for now - if let Ok(tli) = GlobalTimelines::get(new_info.key.id) { - // Note that we also receive *our own* info. That's - // important, as it is used as an indication of live - // connection to the broker. - tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) - .await? - } - } - None => { - // XXX it means we lost connection with etcd, error is consumed inside sub object - debug!("timeline updates sender closed, aborting the pull loop"); - return Ok(()); - } + // TODO: subscribe only to local timelines instead of all + let request = SubscribeSafekeeperInfoRequest { + subscription_key: Some(ProtoSubscriptionKey::All(())), + }; + + let mut stream = client + .subscribe_safekeeper_info(request) + .await + .context("subscribe_safekeper_info request failed")? + .into_inner(); + + while let Some(msg) = stream.message().await? 
{ + let proto_ttid = msg + .tenant_timeline_id + .as_ref() + .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; + let ttid = parse_proto_ttid(proto_ttid)?; + if let Ok(tli) = GlobalTimelines::get(ttid) { + // Note that we also receive *our own* info. That's + // important, as it is used as an indication of live + // connection to the broker. + + // note: there are blocking operations below, but it's considered fine for now + tli.record_safekeeper_info(&msg).await? } } + bail!("end of stream"); } async fn main_loop(conf: SafeKeeperConf) { diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9343611959..a9a9eb3388 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -3,11 +3,14 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; use anyhow::Context; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::SkTimelineInfo; use serde::Serialize; use serde::Serializer; use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::task::JoinError; use crate::safekeeper::ServerInfo; @@ -16,7 +19,6 @@ use crate::safekeeper::Term; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; use crate::SafeKeeperConf; -use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ auth::JwtAuth, http::{ @@ -241,7 +243,22 @@ async fn record_safekeeper_info(mut request: Request) -> Result) -> Result, - pub broker_etcd_prefix: String, + pub broker_endpoint: Uri, pub auth_validation_public_key_path: Option, pub heartbeat_timeout: Duration, pub max_offloader_lag_bytes: u64, @@ -93,8 +92,9 @@ impl Default for SafeKeeperConf { listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), remote_storage: None, my_id: NodeId(0), - broker_endpoints: Vec::new(), - broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + broker_endpoint: storage_broker::DEFAULT_ENDPOINT + .parse() + .expect("failed to parse default broker endpoint"), backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, wal_backup_enabled: true, auth_validation_public_key_path: None, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 7dfa6f636e..2c13f81476 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,13 +4,13 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use etcd_broker::subscription_value::SkTimelineInfo; use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; +use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; @@ -896,39 +896,38 @@ where } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&mut self, sk_info: &SkTimelineInfo) -> Result<()> { + pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { let mut sync_control_file = false; - if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term) - { + + if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. 
- if last_log_term == self.get_epoch() { - self.global_commit_lsn = max(commit_lsn, self.global_commit_lsn); + if sk_info.last_log_term == self.get_epoch() { + self.global_commit_lsn = max(Lsn(sk_info.commit_lsn), self.global_commit_lsn); self.update_commit_lsn()?; } } - if let Some(backup_lsn) = sk_info.backup_lsn { - let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn); - sync_control_file |= - self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; - self.inmem.backup_lsn = new_backup_lsn; - } - if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { - let new_remote_consistent_lsn = - max(remote_consistent_lsn, self.inmem.remote_consistent_lsn); - sync_control_file |= self.state.remote_consistent_lsn - + (self.state.server.wal_seg_size as u64) - < new_remote_consistent_lsn; - self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; - } - if let Some(peer_horizon_lsn) = sk_info.peer_horizon_lsn { - let new_peer_horizon_lsn = max(peer_horizon_lsn, self.inmem.peer_horizon_lsn); - sync_control_file |= self.state.peer_horizon_lsn - + (self.state.server.wal_seg_size as u64) - < new_peer_horizon_lsn; - self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; - } + + let new_backup_lsn = max(Lsn(sk_info.backup_lsn), self.inmem.backup_lsn); + sync_control_file |= + self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; + self.inmem.backup_lsn = new_backup_lsn; + + let new_remote_consistent_lsn = max( + Lsn(sk_info.remote_consistent_lsn), + self.inmem.remote_consistent_lsn, + ); + sync_control_file |= self.state.remote_consistent_lsn + + (self.state.server.wal_seg_size as u64) + < new_remote_consistent_lsn; + self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; + + let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn); + sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) + < new_peer_horizon_lsn; + self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; + if sync_control_file { self.persist_control_file(self.state.clone())?; } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 132a926203..038c32afe0 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -2,7 +2,6 @@ //! to glue together SafeKeeper and all other background services. 
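The record_safekeeper_info() change above replaces Option-typed LSN fields with plain u64 protobuf fields, where zero stands for "unknown". A minimal sketch of the resulting merge rule, assuming (as the surrounding code does) that Lsn::INVALID is the zero LSN:

    // Sketch: merging a peer-reported LSN into local state. Because the protobuf
    // field defaults to 0 and Lsn::INVALID == Lsn(0), taking max() makes an unset
    // peer value a no-op instead of moving the local watermark backwards.
    use std::cmp::max;
    use utils::lsn::Lsn;

    fn merge_peer_lsn(local: Lsn, peer_reported: u64) -> Lsn {
        max(local, Lsn(peer_reported))
    }
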
use anyhow::{bail, Result}; -use etcd_broker::subscription_value::SkTimelineInfo; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; use pq_proto::ReplicationFeedback; @@ -18,6 +17,9 @@ use utils::{ lsn::Lsn, }; +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; + use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, ServerInfo, Term, @@ -47,13 +49,13 @@ pub struct PeerInfo { } impl PeerInfo { - fn from_sk_info(sk_id: NodeId, sk_info: &SkTimelineInfo, ts: Instant) -> PeerInfo { + fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { PeerInfo { - sk_id, - _last_log_term: sk_info.last_log_term.unwrap_or(0), - _flush_lsn: sk_info.flush_lsn.unwrap_or(Lsn::INVALID), - commit_lsn: sk_info.commit_lsn.unwrap_or(Lsn::INVALID), - local_start_lsn: sk_info.local_start_lsn.unwrap_or(Lsn::INVALID), + sk_id: NodeId(sk_info.safekeeper_id), + _last_log_term: sk_info.last_log_term, + _flush_lsn: Lsn(sk_info.flush_lsn), + commit_lsn: Lsn(sk_info.commit_lsn), + local_start_lsn: Lsn(sk_info.local_start_lsn), ts, } } @@ -308,21 +310,31 @@ impl SharedState { pos } - fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { - SkTimelineInfo { - last_log_term: Some(self.sk.get_epoch()), - flush_lsn: Some(self.sk.wal_store.flush_lsn()), + fn get_safekeeper_info( + &self, + ttid: &TenantTimelineId, + conf: &SafeKeeperConf, + ) -> SafekeeperTimelineInfo { + SafekeeperTimelineInfo { + safekeeper_id: conf.my_id.0, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: ttid.tenant_id.as_ref().to_owned(), + timeline_id: ttid.timeline_id.as_ref().to_owned(), + }), + last_log_term: self.sk.get_epoch(), + flush_lsn: self.sk.wal_store.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost - commit_lsn: Some(self.sk.inmem.commit_lsn), + commit_lsn: self.sk.inmem.commit_lsn.0, // TODO: rework feedbacks to avoid max here - remote_consistent_lsn: Some(max( + remote_consistent_lsn: max( self.get_replicas_state().remote_consistent_lsn, self.sk.inmem.remote_consistent_lsn, - )), - peer_horizon_lsn: Some(self.sk.inmem.peer_horizon_lsn), - safekeeper_connstr: Some(conf.listen_pg_addr.clone()), - backup_lsn: Some(self.sk.inmem.backup_lsn), - local_start_lsn: Some(self.sk.state.local_start_lsn), + ) + .0, + peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0, + safekeeper_connstr: conf.listen_pg_addr.clone(), + backup_lsn: self.sk.inmem.backup_lsn.0, + local_start_lsn: self.sk.state.local_start_lsn.0, } } } @@ -682,23 +694,19 @@ impl Timeline { } /// Get safekeeper info for broadcasting to broker and other peers. - pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { + pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { let shared_state = self.write_shared_state(); - shared_state.get_safekeeper_info(conf) + shared_state.get_safekeeper_info(&self.ttid, conf) } /// Update timeline state with peer safekeeper data. 
- pub async fn record_safekeeper_info( - &self, - sk_info: &SkTimelineInfo, - sk_id: NodeId, - ) -> Result<()> { + pub async fn record_safekeeper_info(&self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { let mut shared_state = self.write_shared_state(); shared_state.sk.record_safekeeper_info(sk_info)?; - let peer_info = PeerInfo::from_sk_info(sk_id, sk_info, Instant::now()); + let peer_info = PeerInfo::from_sk_info(sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); is_wal_backup_action_pending = shared_state.update_status(self.ttid); commit_lsn = shared_state.sk.inmem.commit_lsn; diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 843fc53f36..7aa33a5234 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -7,9 +7,11 @@ edition = "2021" bench = [] [dependencies] +anyhow = "1.0" async-stream = "0.3" bytes = "1.0" clap = { version = "4.0", features = ["derive"] } +const_format = "0.2.21" futures = "0.3" futures-core = "0.3" futures-util = "0.3" @@ -19,7 +21,7 @@ hyper = {version = "0.14.14", features = ["full"]} once_cell = "1.13.0" parking_lot = "0.12" prost = "0.11" -tonic = "0.8" +tonic = {version = "0.8", features = ["tls", "tls-roots"]} tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] } tokio-stream = "0.1" tracing = "0.1.27" diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 0a72adc948..73141318b8 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -6,8 +6,8 @@ use clap::Parser; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; -use storage_broker::BrokerClientChannel; -use storage_broker::DEFAULT_LISTEN_ADDR; + +use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; use tonic::Request; @@ -88,9 +88,7 @@ fn tli_from_u64(i: u64) -> Vec { async fn subscribe(client: Option, counter: Arc, i: u64) { let mut client = match client { Some(c) => c, - None => BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) - .await - .unwrap(), + None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(), }; let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { @@ -114,9 +112,7 @@ async fn subscribe(client: Option, counter: Arc, async fn publish(client: Option, n_keys: u64) { let mut client = match client { Some(c) => c, - None => BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) - .await - .unwrap(), + None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(), }; let mut counter: u64 = 0; @@ -156,9 +152,7 @@ async fn main() -> Result<(), Box> { } let h = tokio::spawn(progress_reporter(counters.clone())); - let c = BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) - .await - .unwrap(); + let c = storage_broker::connect(DEFAULT_ENDPOINT).unwrap(); for i in 0..args.num_subs { let c = Some(c.clone()); diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index f25acdfcb3..0629caa2fb 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -2,6 +2,7 @@ use hyper::body::HttpBody; use std::pin::Pin; use std::task::{Context, Poll}; use tonic::codegen::StdError; +use tonic::transport::{ClientTlsConfig, Endpoint}; use tonic::{transport::Channel, Code, Status}; use utils::id::{TenantId, 
TenantTimelineId, TimelineId}; @@ -20,12 +21,35 @@ pub mod metrics; pub use tonic::Request; pub use tonic::Streaming; -pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; +pub use hyper::Uri; -// NeonBrokerClient charged with tonic provided Channel transport; helps to +pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; +pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); + +// BrokerServiceClient charged with tonic provided Channel transport; helps to // avoid depending on tonic directly in user crates. pub type BrokerClientChannel = BrokerServiceClient; +// Create connection object configured to run TLS if schema starts with https:// +// and plain text otherwise. Connection is lazy, only endpoint sanity is +// validated here. +pub fn connect(endpoint: U) -> anyhow::Result +where + U: std::convert::TryInto, + U::Error: std::error::Error + Send + Sync + 'static, +{ + let uri: Uri = endpoint.try_into()?; + let mut tonic_endpoint: Endpoint = uri.into(); + // If schema starts with https, start encrypted connection; do plain text + // otherwise. + if let Some("https") = tonic_endpoint.uri().scheme_str() { + let tls = ClientTlsConfig::new(); + tonic_endpoint = tonic_endpoint.tls_config(tls)?; + } + let channel = tonic_endpoint.connect_lazy(); + Ok(BrokerClientChannel::new(channel)) +} + impl BrokerClientChannel { /// Create a new client to the given endpoint, but don't actually connect until the first request. pub async fn connect_lazy(dst: D) -> Result diff --git a/test_runner/README.md b/test_runner/README.md index e066ac3235..bbb8532b52 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -13,8 +13,6 @@ Prerequisites: below to run from other directories. - The neon git repo, including the postgres submodule (for some tests, e.g. `pg_regress`) -- Some tests (involving storage nodes coordination) require etcd installed. Follow - [`the guide`](https://etcd.io/docs/v3.5/install/) to obtain it. 
### Test Organization diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5fbde5e03b..0d64ca6d65 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -33,7 +33,7 @@ from _pytest.config import Config from _pytest.fixtures import FixtureRequest from fixtures.log_helper import log from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture +from fixtures.utils import Fn, allure_attach_from_dir, get_self_dir, subprocess_capture # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -281,19 +281,22 @@ def port_distributor(worker_base_port: int) -> PortDistributor: @pytest.fixture(scope="session") def default_broker( - request: FixtureRequest, port_distributor: PortDistributor, top_output_dir: Path -) -> Iterator[Etcd]: + request: FixtureRequest, + port_distributor: PortDistributor, + top_output_dir: Path, + neon_binpath: Path, +) -> Iterator[NeonBroker]: + # multiple pytest sessions could get launched in parallel, get them different ports/datadirs client_port = port_distributor.get_port() - # multiple pytest sessions could get launched in parallel, get them different datadirs - etcd_datadir = get_test_output_dir(request, top_output_dir) / f"etcd_datadir_{client_port}" - etcd_datadir.mkdir(exist_ok=True, parents=True) - - broker = Etcd( - datadir=str(etcd_datadir), port=client_port, peer_port=port_distributor.get_port() + broker_logfile = ( + get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log" ) + broker_logfile.parents[0].mkdir(exist_ok=True, parents=True) + + broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) yield broker broker.stop() - allure_attach_from_dir(etcd_datadir) + allure_attach_from_dir(Path(broker_logfile)) @pytest.fixture(scope="session") @@ -570,7 +573,7 @@ class NeonEnvBuilder: self, repo_dir: Path, port_distributor: PortDistributor, - broker: Etcd, + broker: NeonBroker, run_id: uuid.UUID, mock_s3_server: MockS3Server, neon_binpath: Path, @@ -846,9 +849,8 @@ class NeonEnv: toml += textwrap.dedent( f""" - [etcd_broker] - broker_endpoints = ['{self.broker.client_url()}'] - etcd_binary_path = '{self.broker.binary_path}' + [broker] + listen_addr = '{self.broker.listen_addr()}' """ ) @@ -949,7 +951,7 @@ def _shared_simple_env( request: FixtureRequest, port_distributor: PortDistributor, mock_s3_server: MockS3Server, - default_broker: Etcd, + default_broker: NeonBroker, run_id: uuid.UUID, top_output_dir: Path, neon_binpath: Path, @@ -1010,7 +1012,7 @@ def neon_env_builder( neon_binpath: Path, pg_distrib_dir: Path, pg_version: str, - default_broker: Etcd, + default_broker: NeonBroker, run_id: uuid.UUID, ) -> Iterator[NeonEnvBuilder]: """ @@ -1743,7 +1745,7 @@ class NeonPageserver(PgProtocol): # All tests print these, when starting up or shutting down ".*wal receiver task finished with an error: walreceiver connection handling failure.*", ".*Shutdown task error: walreceiver connection handling failure.*", - ".*Etcd client error: grpc request error: status: Unavailable.*", + ".*wal_connection_manager.*tcp connect error: Connection refused.*", ".*query handler for .* failed: Connection reset by peer.*", ".*serving compute connection task.*exited with error: Broken pipe.*", ".*Connection aborted: error communicating with the server: Broken pipe.*", @@ -1834,7 +1836,6 @@ class NeonPageserver(PgProtocol): def 
assert_no_errors(self): logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") - error_or_warn = re.compile("ERROR|WARN") errors = [] while True: @@ -2653,51 +2654,36 @@ class SafekeeperHttpClient(requests.Session): @dataclass -class Etcd: - """An object managing etcd instance""" +class NeonBroker: + """An object managing storage_broker instance""" - datadir: str + logfile: Path port: int - peer_port: int - binary_path: Path = field(init=False) + neon_binpath: Path handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon - def __post_init__(self): - self.binary_path = etcd_path() + def listen_addr(self): + return f"127.0.0.1:{self.port}" def client_url(self): - return f"http://127.0.0.1:{self.port}" + return f"http://{self.listen_addr()}" def check_status(self): - with requests.Session() as s: - s.mount("http://", requests.adapters.HTTPAdapter(max_retries=1)) # do not retry - s.get(f"{self.client_url()}/health").raise_for_status() + return True # TODO def try_start(self): if self.handle is not None: - log.debug(f"etcd is already running on port {self.port}") + log.debug(f"storage_broker is already running on port {self.port}") return - Path(self.datadir).mkdir(exist_ok=True) - - if not self.binary_path.is_file(): - raise RuntimeError(f"etcd broker binary '{self.binary_path}' is not a file") - - client_url = self.client_url() - log.info(f'Starting etcd to listen incoming connections at "{client_url}"') - with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file: + listen_addr = self.listen_addr() + log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') + with open(self.logfile, "wb") as logfile: args = [ - self.binary_path, - f"--data-dir={self.datadir}", - f"--listen-client-urls={client_url}", - f"--advertise-client-urls={client_url}", - f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}", - # Set --quota-backend-bytes to keep the etcd virtual memory - # size smaller. Our test etcd clusters are very small. 
- # See https://github.com/etcd-io/etcd/issues/7910 - "--quota-backend-bytes=100000000", + self.neon_binpath / "storage_broker", + f"--listen-addr={listen_addr}", ] - self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) + self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) # wait for start started_at = time.time() @@ -2707,7 +2693,9 @@ class Etcd: except Exception as e: elapsed = time.time() - started_at if elapsed > 5: - raise RuntimeError(f"timed out waiting {elapsed:.0f}s for etcd start: {e}") + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}" + ) time.sleep(0.5) else: break # success diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 506fe6f9da..1fb9eb72e6 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,7 +1,6 @@ import contextlib import os import re -import shutil import subprocess import tarfile import time @@ -74,13 +73,6 @@ def print_gc_result(row: Dict[str, Any]): ) -def etcd_path() -> Path: - path_output = shutil.which("etcd") - if path_output is None: - raise RuntimeError("etcd not found in PATH") - return Path(path_output) - - def query_scalar(cur: cursor, query: str) -> Any: """ It is a convenience wrapper to avoid repetitions diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 6b3324b7a7..e2822427e9 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -97,17 +97,19 @@ def test_backward_compatibility( ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, - ) - breaking_changes_allowed = ( os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + try: + # Copy the snapshot to current directory, and prepare for the test + prepare_snapshot( + from_dir=compatibility_snapshot_dir, + to_dir=test_output_dir / "compatibility_snapshot", + neon_binpath=neon_binpath, + port_distributor=port_distributor, + ) + check_neon_works( test_output_dir / "compatibility_snapshot" / "repo", neon_binpath, @@ -155,18 +157,21 @@ def test_forward_compatibility( compatibility_snapshot_dir = ( test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14" ) - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, - pg_distrib_dir=compatibility_postgres_distrib_dir, - ) breaking_changes_allowed = ( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + try: + # Copy the snapshot to current directory, and prepare for the test + prepare_snapshot( + from_dir=compatibility_snapshot_dir, + to_dir=test_output_dir / "compatibility_snapshot", + port_distributor=port_distributor, + neon_binpath=compatibility_neon_bin, + pg_distrib_dir=compatibility_postgres_distrib_dir, + ) + check_neon_works( test_output_dir / "compatibility_snapshot" / "repo", compatibility_neon_bin, @@ -194,6 +199,7 @@ def prepare_snapshot( from_dir: 
Path, to_dir: Path, port_distributor: PortDistributor, + neon_binpath: Path, pg_distrib_dir: Optional[Path] = None, ): assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" @@ -227,9 +233,14 @@ def prepare_snapshot( pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( pageserver_config["listen_pg_addr"] ) - pageserver_config["broker_endpoints"] = [ - port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"] - ] + # since storage_broker these are overriden by neon_local during pageserver + # start; remove both to prevent unknown options during etcd -> + # storage_broker migration. TODO: remove once broker is released + pageserver_config.pop("broker_endpoint", None) + pageserver_config.pop("broker_endpoints", None) + etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"] + if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": + pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version if pg_distrib_dir: pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) @@ -239,10 +250,22 @@ def prepare_snapshot( snapshot_config_toml = repo_dir / "config" snapshot_config = toml.load(snapshot_config_toml) - snapshot_config["etcd_broker"]["broker_endpoints"] = [ - port_distributor.replace_with_new_port(ep) - for ep in snapshot_config["etcd_broker"]["broker_endpoints"] - ] + + # Provide up/downgrade etcd <-> storage_broker to make forward/backward + # compatibility test happy. TODO: leave only the new part once broker is released. + if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": + # old etcd version + snapshot_config["etcd_broker"] = { + "etcd_binary_path": shutil.which("etcd"), + "broker_endpoints": etcd_broker_endpoints, + } + snapshot_config.pop("broker", None) + else: + # new storage_broker version + broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}" + snapshot_config["broker"] = {"listen_addr": broker_listen_addr} + snapshot_config.pop("etcd_broker", None) + snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port( snapshot_config["pageserver"]["listen_http_addr"] ) @@ -277,6 +300,12 @@ def prepare_snapshot( ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" +# get git SHA of neon binary +def get_neon_version(neon_binpath: Path): + out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8") + return out.split("git:", 1)[1].rstrip() + + def check_neon_works( repo_dir: Path, neon_binpath: Path, diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index c4b3b28f34..081fd0fc2f 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( - Etcd, + NeonBroker, NeonEnv, NeonEnvBuilder, PageserverHttpClient, @@ -32,7 +32,7 @@ def new_pageserver_service( remote_storage_mock_path: Path, pg_port: int, http_port: int, - broker: Optional[Etcd], + broker: Optional[NeonBroker], pg_distrib_dir: Path, ): """ @@ -53,7 +53,7 @@ def new_pageserver_service( ] if broker is not None: cmd.append( - f"-c broker_endpoints=['{broker.client_url()}']", + f"-c broker_endpoint='{broker.client_url()}'", ) pageserver_client = PageserverHttpClient( port=http_port, diff 
--git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3945376e5e..3b72aba422 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -16,7 +16,7 @@ from typing import Any, List, Optional import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( - Etcd, + NeonBroker, NeonEnv, NeonEnvBuilder, NeonPageserver, @@ -520,7 +520,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re ) # advance remote_consistent_lsn to trigger WAL trimming - # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates + # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push broker updates env.safekeepers[0].http_client().record_safekeeper_info( tenant_id, timeline_id, {"remote_consistent_lsn": str(offloaded_seg_end)} ) @@ -812,10 +812,10 @@ class SafekeeperEnv: ): self.repo_dir = repo_dir self.port_distributor = port_distributor - self.broker = Etcd( - datadir=os.path.join(self.repo_dir, "etcd"), + self.broker = NeonBroker( + logfile=Path(self.repo_dir) / "storage_broker.log", port=self.port_distributor.get_port(), - peer_port=self.port_distributor.get_port(), + neon_binpath=neon_binpath, ) self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers @@ -863,7 +863,7 @@ class SafekeeperEnv: str(safekeeper_dir), "--id", str(i), - "--broker-endpoints", + "--broker-endpoint", self.broker.client_url(), ] log.info(f'Running command "{" ".join(cmd)}"') diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e50a559a4b..de9a26513d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -32,14 +32,14 @@ nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", features = ["i128", "std"] } num-traits = { version = "0.2", features = ["i128", "libm", "std"] } -prost-93f6ce9d446188ac = { package = "prost", version = "0.10", features = ["prost-derive", "std"] } -prost-a6292c17cd707f01 = { package = "prost", version = "0.11", features = ["prost-derive", "std"] } +prost = { version = "0.11", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +socket2 = { version = "0.4", default-features = false, features = ["all"] } stable_deref_trait = { version = "1", features = ["alloc", "std"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", 
"net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } @@ -59,8 +59,7 @@ libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std"] } nom = { version = "7", features = ["alloc", "std"] } -prost-93f6ce9d446188ac = { package = "prost", version = "0.10", features = ["prost-derive", "std"] } -prost-a6292c17cd707f01 = { package = "prost", version = "0.11", features = ["prost-derive", "std"] } +prost = { version = "0.11", features = ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } From a1fd0ba23bba3f4a9342d4fd34707f1afc533561 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 2 Dec 2022 22:15:26 +0400 Subject: [PATCH 050/167] set tag to make proper e2e tests run --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 30ceac1af1..13ffece69d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -449,7 +449,7 @@ jobs: --user "${{ secrets.CI_ACCESS_TOKEN }}" \ --data \ "{ - \"ref\": \"main\", + \"ref\": \"replace-etcd\", \"inputs\": { \"ci_job_name\": \"neon-cloud-e2e\", \"commit_hash\": \"$COMMIT_SHA\", From 26f4ff949ac623c9ee35f47557749be831b075e4 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 8 Dec 2022 10:47:50 +0100 Subject: [PATCH 051/167] Add sentry to storage_broker. 
--- .github/workflows/build_and_test.yml | 6 +++--- storage_broker/src/bin/storage_broker.rs | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 13ffece69d..080512fa14 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -927,7 +927,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s deploy-proxy-new: runs-on: [ self-hosted, dev, x64 ] @@ -1012,7 +1012,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s deploy-proxy-prod-new: runs-on: prod @@ -1089,7 +1089,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 04f93a1ebb..fdf2637b4d 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -43,6 +43,7 @@ use storage_broker::{parse_proto_ttid, EitherBody, DEFAULT_LISTEN_ADDR}; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; +use utils::sentry_init::{init_sentry, release_name}; project_git_version!(GIT_VERSION); @@ -417,6 +418,9 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(release_name!(), &[]); + let args = Args::parse(); logging::init(LogFormat::from_config(&args.log_format)?)?; From 0aa2f5c9a5eab3d507c3c468d0902fe4e91de863 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 12 Dec 2022 12:58:55 +0200 Subject: [PATCH 
052/167] Regroup CI testing (#3049) Part of https://github.com/neondatabase/neon/pull/2410 and https://github.com/neondatabase/neon/pull/2407 * adds `hashFiles('rust-toolchain.toml')` into Rust cache keys, thus removing one of the manual steps to do when upgrading rustc * copies Python and Rust style checks from the `codestyle.yml` workflow * adjusts shell defaults in the main workflow * replaces `codestyle.yml` with a `neon_extra_builds.yml` workflow The new workflow runs on commits to `main` (`codestyle.yml` was run per PR), and runs two custom builds on GH agents: * macos-latest, to ensure the entire project compiles on it (no tests run). There were no frequent breakages on macOS in our builds, so we can check it rarely without making every storage PR wait for it to complete. The updated mac build uses release builds now, so it should presumably work a bit faster due to overall smaller files to cache between builds. * ubuntu-latest, without caches, to produce full compilation stats for Rust builds and upload them as an artifact to GitHub. Old `clippy build --timings` stats were collected from the builds that use caches and incremental compilation, hence they could never produce a full report; it got removed. --- .github/workflows/build_and_test.yml | 132 ++++++++++++++----- .github/workflows/codestyle.yml | 166 ------------------------ .github/workflows/neon_extra_builds.yml | 128 ++++++++++++++++++ rust-toolchain.toml | 2 +- 4 files changed, 228 insertions(+), 200 deletions(-) delete mode 100644 .github/workflows/codestyle.yml create mode 100644 .github/workflows/neon_extra_builds.yml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 080512fa14..44b691754a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -7,6 +7,10 @@ on: - release pull_request: +defaults: + run: + shell: bash -euxo pipefail {0} + concurrency: # Allow only one workflow per any non-`main` branch. group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} @@ -45,6 +49,79 @@ jobs: shell: bash id: build-tag + check-codestyle-python: + runs-on: [ self-hosted, Linux, k8s-runner ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: Run isort to ensure code format + run: poetry run isort --diff --check . + + - name: Run black to ensure code format + run: poetry run black --diff --check . + + - name: Run flake8 to ensure code format + run: poetry run flake8 . + + - name: Run mypy to check types + run: poetry run mypy .
+ + check-codestyle-rust: + runs-on: [ self-hosted, dev, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Restore cargo deps cache + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry/ + !~/.cargo/registry/src + ~/.cargo/git/ + target/ + key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres headers + run: make postgres-headers -j$(nproc) + - name: Run cargo clippy + run: ./run_clippy.sh + + # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + build-neon: runs-on: [ self-hosted, dev, x64 ] container: @@ -79,12 +156,10 @@ jobs: - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - shell: bash -euxo pipefail {0} - name: Set pg 15 revision for caching id: pg_v15_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - shell: bash -euxo pipefail {0} # Set some environment variables used by all the steps. # @@ -101,16 +176,15 @@ jobs: if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FEATURES="--features testing" - CARGO_FLAGS="--locked --timings $CARGO_FEATURES" + CARGO_FLAGS="--locked $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FEATURES="--features testing,profiling" - CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV - shell: bash -euxo pipefail {0} # Don't include the ~/.cargo/registry/src directory. 
It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache @@ -127,8 +201,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - v10-${{ runner.os }}-${{ matrix.build_type }}-cargo- + v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} + v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 @@ -147,26 +221,21 @@ jobs: - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) - shell: bash -euxo pipefail {0} - name: Build postgres v15 if: steps.cache_pg_15.outputs.cache-hit != 'true' run: mold -run make postgres-v15 -j$(nproc) - shell: bash -euxo pipefail {0} - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) - shell: bash -euxo pipefail {0} - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests - shell: bash -euxo pipefail {0} - name: Run cargo test run: | ${cov_prefix} cargo test $CARGO_FLAGS - shell: bash -euxo pipefail {0} - name: Install rust binaries run: | @@ -207,11 +276,9 @@ jobs: echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list done fi - shell: bash -euxo pipefail {0} - name: Install postgres binaries run: cp -a pg_install /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - name: Upload Neon artifact uses: ./.github/actions/upload @@ -219,17 +286,6 @@ jobs: name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact path: /tmp/neon - - name: Prepare cargo build timing stats for storing - run: | - mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/" - cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/" - shell: bash -euxo pipefail {0} - - name: Upload cargo build stats - uses: ./.github/actions/upload - with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats - path: /tmp/neon/cargo-timings/ - # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: matrix.build_type == 'debug' @@ -250,7 +306,7 @@ jobs: uses: actions/checkout@v3 with: submodules: true - fetch-depth: 2 + fetch-depth: 1 - name: Pytest regression tests uses: ./.github/actions/run-python-test-set @@ -284,7 +340,7 @@ jobs: uses: actions/checkout@v3 with: submodules: true - fetch-depth: 2 + fetch-depth: 1 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set @@ -330,7 +386,6 @@ jobs: SHA: ${{ github.event.pull_request.head.sha || github.sha }} REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} - shell: bash -euxo pipefail {0} run: | curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json ./scripts/pysync @@ -363,7 +418,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -379,7 +434,6 @@ jobs: - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - shell: bash -euxo pipefail {0} - name: Build and upload coverage report 
run: | @@ -412,7 +466,6 @@ jobs: \"description\": \"Coverage report is ready\", \"target_url\": \"$REPORT_URL\" }" - shell: bash -euxo pipefail {0} trigger-e2e-tests: runs-on: [ self-hosted, dev, x64 ] @@ -463,6 +516,9 @@ jobs: runs-on: [ self-hosted, dev, x64 ] needs: [ tag ] container: gcr.io/kaniko-project/executor:v1.9.0-debug + defaults: + run: + shell: sh -eu {0} steps: - name: Checkout @@ -481,6 +537,9 @@ jobs: runs-on: [ self-hosted, dev, x64 ] needs: [ tag ] container: gcr.io/kaniko-project/executor:v1.9.0-debug + defaults: + run: + shell: sh -eu {0} steps: - name: Checkout @@ -496,6 +555,10 @@ jobs: runs-on: [ self-hosted, dev, x64 ] container: gcr.io/kaniko-project/executor:v1.9.0-debug needs: [ tag ] + defaults: + run: + shell: sh -eu {0} + steps: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko @@ -513,6 +576,10 @@ jobs: runs-on: [ self-hosted, dev, x64 ] container: gcr.io/kaniko-project/executor:v1.9.0-debug needs: [ tag ] + defaults: + run: + shell: sh -eu {0} + steps: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko @@ -780,7 +847,7 @@ jobs: # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly needs: [ push-docker-hub, tag, regress-tests ] if: | - contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && + contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && github.event_name != 'workflow_dispatch' defaults: run: @@ -1100,7 +1167,6 @@ jobs: if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release - shell: bash -euxo pipefail {0} env: BUCKET: neon-github-public-dev PREFIX: artifacts/latest diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml deleted file mode 100644 index 01fef71c9a..0000000000 --- a/.github/workflows/codestyle.yml +++ /dev/null @@ -1,166 +0,0 @@ -name: Check code style and build - -on: - push: - branches: - - main - pull_request: - -defaults: - run: - shell: bash -euxo pipefail {0} - -concurrency: - # Allow only one workflow per any non-`main` branch. - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} - cancel-in-progress: true - -env: - RUST_BACKTRACE: 1 - COPT: '-Werror' - -jobs: - check-codestyle-rust: - strategy: - fail-fast: false - matrix: - # XXX: both OSes have rustup - # * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools - # * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools - # this is all we need to install our toolchain later via rust-toolchain.toml - # so don't install any toolchain explicitly. 
- os: [ubuntu-latest, macos-latest] - timeout-minutes: 90 - name: check codestyle rust and postgres - runs-on: ${{ matrix.os }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 2 - - - name: Check formatting - run: cargo fmt --all -- --check - - - name: Install Ubuntu postgres dependencies - if: matrix.os == 'ubuntu-latest' - run: | - sudo apt update - sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev protobuf-compiler - - - name: Install macOS postgres dependencies - if: matrix.os == 'macos-latest' - run: brew install flex bison openssl protobuf - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - shell: bash -euxo pipefail {0} - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - shell: bash -euxo pipefail {0} - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v3 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v3 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Set extra env for macOS - if: matrix.os == 'macos-latest' - run: | - echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV - echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: make postgres-v14 - shell: bash -euxo pipefail {0} - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: make postgres-v15 - shell: bash -euxo pipefail {0} - - - name: Build neon extensions - run: make neon-pg-ext - - - name: Cache cargo deps - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - - - name: Run cargo clippy - run: ./run_clippy.sh - - - name: Ensure all project builds - run: cargo build --locked --all --all-targets - - check-rust-dependencies: - runs-on: [ self-hosted, dev, x64 ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - fetch-depth: 1 - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check every project module is covered by Hakari - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - shell: bash -euxo pipefail {0} - - check-codestyle-python: - runs-on: [ self-hosted, Linux, k8s-runner ] - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - fetch-depth: 1 - - - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - run: ./scripts/pysync - - - name: Run isort to ensure code format - run: poetry run isort 
--diff --check . - - - name: Run black to ensure code format - run: poetry run black --diff --check . - - - name: Run flake8 to ensure code format - run: poetry run flake8 . - - - name: Run mypy to check types - run: poetry run mypy . diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml new file mode 100644 index 0000000000..b8600e0665 --- /dev/null +++ b/.github/workflows/neon_extra_builds.yml @@ -0,0 +1,128 @@ +name: Check neon with extra platform builds + +on: + push: + branches: + - main + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + check-macos-build: + timeout-minutes: 90 + runs-on: macos-latest + + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Install macOS postgres dependencies + run: brew install flex bison openssl protobuf + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v3 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Set extra env for macOS + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Cache cargo deps + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: make postgres-v14 -j$(nproc) + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: make postgres-v15 -j$(nproc) + + - name: Build neon extensions + run: make neon-pg-ext -j$(nproc) + + - name: Run cargo build + run: cargo build --all --release + + - name: Check that no warnings are produced + run: ./run_clippy.sh + + gather-rust-build-stats: + timeout-minutes: 90 + runs-on: ubuntu-latest + + env: + BUILD_TYPE: release + # build with incremental compilation produce partial results + # so do not attempt to cache this build, also disable the incremental compilation + CARGO_INCREMENTAL: 0 + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Install Ubuntu postgres dependencies + run: | + sudo apt update + sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev protobuf-compiler + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres 
headers + run: make postgres-headers -j$(nproc) + + - name: Produce the build stats + run: cargo build --all --release --timings + + - name: Upload the build stats + uses: actions/upload-artifact@v3 + with: + name: neon-${{ runner.os }}-release-build-stats + path: ./target/cargo-timings/ diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 928a10e555..7ee14a8f41 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -4,7 +4,7 @@ # version, we can consider updating. # See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package, # we use "unstable" version number as the highest version used in the project by default. -channel = "1.62.1" # do update GitHub CI cache values for rust builds, when changing this value +channel = "1.62.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From f013d53230f81d11684943fcd4b3177ca49f6545 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 8 Dec 2022 18:58:41 +0400 Subject: [PATCH 053/167] Switch to clap derive API in safekeeper. Less lines and easier to read/modify. Practically no functional changes. --- safekeeper/Cargo.toml | 2 +- safekeeper/src/bin/safekeeper.rs | 322 ++++++++++--------------- safekeeper/src/control_file.rs | 2 +- safekeeper/src/lib.rs | 41 ++-- safekeeper/src/timelines_global_map.rs | 35 ++- safekeeper/src/wal_backup.rs | 7 +- 6 files changed, 171 insertions(+), 238 deletions(-) diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index daee368b12..d11ef1711a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -9,7 +9,7 @@ anyhow = "1.0" async-trait = "0.1" byteorder = "1.4.3" bytes = "1.0.1" -clap = "4.0" +clap = { version = "4.0", features = ["derive"] } const_format = "0.2.21" crc32c = "0.6.0" fs2 = "0.4.3" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d6ce5f8ac4..92cd5db203 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -2,16 +2,19 @@ // Main entry point for the safekeeper executable // use anyhow::{bail, Context, Result}; -use clap::{value_parser, Arg, ArgAction, Command}; -use const_format::formatcp; +use clap::Parser; use remote_storage::RemoteStorageConfig; +use toml_edit::Document; + use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::thread; +use std::time::Duration; +use storage_broker::Uri; use tokio::sync::mpsc; -use toml_edit::Document; + use tracing::*; use utils::pid_file; @@ -20,7 +23,7 @@ use safekeeper::broker; use safekeeper::control_file; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, - DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; use safekeeper::remove_wal; @@ -44,124 +47,131 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); -fn main() -> anyhow::Result<()> { - let arg_matches = cli().get_matches(); +const ABOUT: &str = r#" +A fleet of safekeepers is responsible for reliably storing WAL received from +compute, passing it through consensus (mitigating potential computes brain +split), and serving the hardened part further downstream to pageserver(s). 
+"#; - if let Some(addr) = arg_matches.get_one::("dump-control-file") { - let state = control_file::FileStorage::load_control_file(Path::new(addr))?; +#[derive(Parser)] +#[command(name = "Neon safekeeper", version = GIT_VERSION, about = ABOUT, long_about = None)] +struct Args { + /// Path to the safekeeper data directory. + #[arg(short = 'D', long, default_value = "./")] + datadir: PathBuf, + /// Safekeeper node id. + #[arg(long)] + id: Option, + /// Initialize safekeeper with given id and exit. + #[arg(long)] + init: bool, + /// Listen endpoint for receiving/sending WAL in the form host:port. + #[arg(short, long, default_value = DEFAULT_PG_LISTEN_ADDR)] + listen_pg: String, + /// Listen http endpoint for management and metrics in the form host:port. + #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)] + listen_http: String, + /// Do not wait for changes to be written safely to disk. Unsafe. + #[arg(short, long)] + no_sync: bool, + /// Dump control file at path specified by this argument and exit. + #[arg(long)] + dump_control_file: Option, + /// Broker endpoint for storage nodes coordination in the form + /// http[s]://host:port. In case of https schema TLS is connection is + /// established; plaintext otherwise. + #[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)] + broker_endpoint: Uri, + /// Peer safekeeper is considered dead after not receiving heartbeats from + /// it during this period passed as a human readable duration. + #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)] + heartbeat_timeout: Duration, + /// Remote storage configuration for WAL backup (offloading to s3) as TOML + /// inline table, e.g. + /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119} + /// Safekeeper offloads WAL to + /// [prefix_in_bucket/]//, mirroring + /// structure on the file system. + #[arg(long, value_parser = parse_remote_storage, verbatim_doc_comment)] + remote_storage: Option, + /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes + #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)] + max_offloader_lag: u64, + /// Number of threads for wal backup runtime, by default number of cores + /// available to the system. + #[arg(long)] + wal_backup_threads: Option, + /// Disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring + /// WAL backup horizon. + #[arg(long)] + disable_wal_backup: bool, + /// Path to an RSA .pem public key which is used to check JWT tokens. + #[arg(long)] + auth_validation_public_key_path: Option, + /// Format for logging, either 'plain' or 'json'. + #[arg(long, default_value = "plain")] + log_format: String, +} + +fn main() -> anyhow::Result<()> { + let args = Args::parse(); + + if let Some(addr) = args.dump_control_file { + let state = control_file::FileStorage::load_control_file(addr)?; let json = serde_json::to_string(&state)?; print!("{json}"); return Ok(()); } - let mut conf = SafeKeeperConf::default(); + logging::init(LogFormat::from_config(&args.log_format)?)?; + info!("version: {GIT_VERSION}"); - if let Some(dir) = arg_matches.get_one::("datadir") { - // change into the data directory. - std::env::set_current_dir(dir)?; + // Change into the data directory. + std::env::set_current_dir(&args.datadir)?; + + // Set or read our ID. 
+ let id = set_id(&args.datadir, args.id.map(NodeId))?; + if args.init { + return Ok(()); } - if arg_matches.get_flag("no-sync") { - conf.no_sync = true; - } - - if let Some(addr) = arg_matches.get_one::("listen-pg") { - conf.listen_pg_addr = addr.to_string(); - } - - if let Some(addr) = arg_matches.get_one::("listen-http") { - conf.listen_http_addr = addr.to_string(); - } - - let mut given_id = None; - if let Some(given_id_str) = arg_matches.get_one::("id") { - given_id = Some(NodeId( - given_id_str - .parse() - .context("failed to parse safekeeper id")?, - )); - } - - if let Some(addr) = arg_matches.get_one::("broker-endpoint") { - conf.broker_endpoint = addr.parse().context("failed to parse broker endpoint")?; - } - - if let Some(heartbeat_timeout_str) = arg_matches.get_one::("heartbeat-timeout") { - conf.heartbeat_timeout = - humantime::parse_duration(heartbeat_timeout_str).with_context(|| { - format!( - "failed to parse heartbeat-timeout {}", - heartbeat_timeout_str - ) - })?; - } - - if let Some(backup_threads) = arg_matches.get_one::("wal-backup-threads") { - conf.backup_runtime_threads = backup_threads - .parse() - .with_context(|| format!("Failed to parse backup threads {}", backup_threads))?; - } - if let Some(storage_conf) = arg_matches.get_one::("remote-storage") { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {}", storage_conf); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?); - } - if let Some(max_offloader_lag_str) = arg_matches.get_one::("max-offloader-lag") { - conf.max_offloader_lag_bytes = max_offloader_lag_str.parse().with_context(|| { - format!( - "failed to parse max offloader lag {}", - max_offloader_lag_str - ) - })?; - } - // Seems like there is no better way to accept bool values explicitly in clap. 
- conf.wal_backup_enabled = arg_matches - .get_one::("enable-wal-backup") - .unwrap() - .parse() - .context("failed to parse bool enable-s3-offload bool")?; - - conf.auth_validation_public_key_path = arg_matches - .get_one::("auth-validation-public-key-path") - .map(PathBuf::from); - - if let Some(log_format) = arg_matches.get_one::("log-format") { - conf.log_format = LogFormat::from_config(log_format)?; - } + let conf = SafeKeeperConf { + workdir: args.datadir, + my_id: id, + listen_pg_addr: args.listen_pg, + listen_http_addr: args.listen_http, + no_sync: args.no_sync, + broker_endpoint: args.broker_endpoint, + heartbeat_timeout: args.heartbeat_timeout, + remote_storage: args.remote_storage, + max_offloader_lag_bytes: args.max_offloader_lag, + backup_runtime_threads: args.wal_backup_threads, + wal_backup_enabled: !args.disable_wal_backup, + auth_validation_public_key_path: args.auth_validation_public_key_path, + }; // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]); - start_safekeeper(conf, given_id, arg_matches.get_flag("init")) + start_safekeeper(conf) } -fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { - logging::init(conf.log_format)?; - info!("version: {GIT_VERSION}"); - +fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { // Prevent running multiple safekeepers on the same directory let lock_file_path = conf.workdir.join(PID_FILE_NAME); let lock_file = pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; - info!("Claimed pid file at {lock_file_path:?}"); + info!("claimed pid file at {lock_file_path:?}"); // ensure that the lock file is held even if the main thread of the process is panics // we need to release the lock file only when the current process is gone std::mem::forget(lock_file); - // Set or read our ID. 
- set_id(&mut conf, given_id)?; - if init { - return Ok(()); - } - let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_http_addr, e); e })?; - info!("Starting safekeeper on {}", conf.listen_pg_addr); + info!("starting safekeeper on {}", conf.listen_pg_addr); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); e @@ -169,11 +179,11 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let auth = match conf.auth_validation_public_key_path.as_ref() { None => { - info!("Auth is disabled"); + info!("auth is disabled"); None } Some(path) => { - info!("Loading JWT auth key from {}", path.display()); + info!("loading JWT auth key from {}", path.display()); Some(Arc::new( JwtAuth::from_key_path(path).context("failed to load the auth key")?, )) @@ -210,7 +220,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() - .name("Safekeeper thread".into()) + .name("safekeeper thread".into()) .spawn(|| { if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) { info!("safekeeper thread terminated: {e}"); @@ -239,12 +249,11 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo })?, ); - let conf_ = conf.clone(); threads.push( thread::Builder::new() - .name("wal backup launcher thread".into()) + .name("WAL backup launcher thread".into()) .spawn(move || { - wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx); + wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx); })?, ); @@ -263,12 +272,12 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo }) } -/// Determine safekeeper id and set it in config. -fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { - let id_file_path = conf.workdir.join(ID_FILE_NAME); +/// Determine safekeeper id. +fn set_id(workdir: &Path, given_id: Option) -> Result { + let id_file_path = workdir.join(ID_FILE_NAME); let my_id: NodeId; - // If ID exists, read it in; otherwise set one passed + // If file with ID exists, read it in; otherwise set one passed. 
match fs::read(&id_file_path) { Ok(id_serialized) => { my_id = NodeId( @@ -298,110 +307,27 @@ fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { let mut f = File::create(&id_file_path)?; f.write_all(my_id.to_string().as_bytes())?; f.sync_all()?; - info!("initialized safekeeper ID {}", my_id); + info!("initialized safekeeper id {}", my_id); } _ => { return Err(error.into()); } }, } - conf.my_id = my_id; - Ok(()) + Ok(my_id) } -fn cli() -> Command { - Command::new("Neon safekeeper") - .about("Store WAL stream to local file system and push it to WAL receivers") - .version(GIT_VERSION) - .arg( - Arg::new("datadir") - .short('D') - .long("dir") - .value_parser(value_parser!(PathBuf)) - .help("Path to the safekeeper data directory"), - ) - .arg( - Arg::new("init") - .long("init") - .action(ArgAction::SetTrue) - .help("Initialize safekeeper with ID"), - ) - .arg( - Arg::new("listen-pg") - .short('l') - .long("listen-pg") - .alias("listen") // for compatibility - .help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")), - ) - .arg( - Arg::new("listen-http") - .long("listen-http") - .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), - ) - // FIXME this argument is no longer needed since pageserver address is forwarded from compute. - // However because this argument is in use by console's e2e tests let's keep it for now and remove separately. - // So currently it is a noop. - .arg( - Arg::new("pageserver") - .short('p') - .long("pageserver"), - ) - .arg( - Arg::new("no-sync") - .short('n') - .long("no-sync") - .action(ArgAction::SetTrue) - .help("Do not wait for changes to be written safely to disk"), - ) - .arg( - Arg::new("dump-control-file") - .long("dump-control-file") - .help("Dump control file at path specified by this argument and exit"), - ) - .arg( - Arg::new("id").long("id").help("safekeeper node id: integer") - ).arg( - Arg::new("broker-endpoint") - .long("broker-endpoint") - .help(formatcp!("Broker endpoint for storage nodes coordination in the form http[s]://host:port, default '{DEFAULT_ENDPOINT}'. In case of https schema TLS is connection is established; plaintext otherwise.")), - ) - .arg( - Arg::new("heartbeat-timeout") - .long("heartbeat-timeout") - .help(formatcp!("Peer is considered dead after not receiving heartbeats from it during this period (default {}s), passed as a human readable duration.", DEFAULT_HEARTBEAT_TIMEOUT.as_secs())) - ) - .arg( - Arg::new("wal-backup-threads").long("backup-threads").help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), - ).arg( - Arg::new("remote-storage") - .long("remote-storage") - .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") - ) - .arg( - Arg::new("max-offloader-lag") - .long("max-offloader-lag") - .help(formatcp!("Safekeeper won't be elected for WAL offloading if it is lagging for more than this value (default {}MB) in bytes", DEFAULT_MAX_OFFLOADER_LAG_BYTES / (1 << 20))) - ) - .arg( - Arg::new("enable-wal-backup") - .long("enable-wal-backup") - .default_value("true") - .default_missing_value("true") - .help("Enable/disable WAL backup to s3. 
When disabled, safekeeper removes WAL ignoring WAL backup horizon."), - ) - .arg( - Arg::new("auth-validation-public-key-path") - .long("auth-validation-public-key-path") - .help("Path to an RSA .pem public key which is used to check JWT tokens") - ) - .arg( - Arg::new("log-format") - .long("log-format") - .help("Format for logging, either 'plain' or 'json'") - ) +// Parse RemoteStorage from TOML table. +fn parse_remote_storage(storage_conf: &str) -> Result { + // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse + let storage_conf_toml = format!("remote_storage = {}", storage_conf); + let parsed_toml = storage_conf_toml.parse::()?; // parse + let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again + RemoteStorageConfig::from_toml(storage_conf_parsed_toml) } #[test] fn verify_cli() { - cli().debug_assert(); + use clap::CommandFactory; + Args::command().debug_assert() } diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 6be3f9abb2..f4a0f8520c 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -231,7 +231,7 @@ mod test { let workdir = tempfile::tempdir().unwrap().into_path(); SafeKeeperConf { workdir, - ..Default::default() + ..SafeKeeperConf::dummy() } } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 7261848092..60a1911068 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,16 +1,10 @@ -use defaults::{ - DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, -}; use storage_broker::Uri; // use remote_storage::RemoteStorageConfig; use std::path::PathBuf; use std::time::Duration; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId}, - logging::LogFormat, -}; +use utils::id::{NodeId, TenantId, TenantTimelineId}; mod auth; pub mod broker; @@ -33,15 +27,13 @@ mod timelines_global_map; pub use timelines_global_map::GlobalTimelines; pub mod defaults { - use std::time::Duration; - pub use safekeeper_api::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; - pub const DEFAULT_HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(5); + pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); } @@ -54,19 +46,17 @@ pub struct SafeKeeperConf { // to the process but different unit tests work on different // data directories to avoid clashing with each other. pub workdir: PathBuf, - - pub no_sync: bool, + pub my_id: NodeId, pub listen_pg_addr: String, pub listen_http_addr: String, - pub remote_storage: Option, - pub backup_runtime_threads: usize, - pub wal_backup_enabled: bool, - pub my_id: NodeId, + pub no_sync: bool, pub broker_endpoint: Uri, - pub auth_validation_public_key_path: Option, pub heartbeat_timeout: Duration, + pub remote_storage: Option, pub max_offloader_lag_bytes: u64, - pub log_format: LogFormat, + pub backup_runtime_threads: Option, + pub wal_backup_enabled: bool, + pub auth_validation_public_key_path: Option, } impl SafeKeeperConf { @@ -80,12 +70,10 @@ impl SafeKeeperConf { } } -impl Default for SafeKeeperConf { - fn default() -> Self { +impl SafeKeeperConf { + #[cfg(test)] + fn dummy() -> Self { SafeKeeperConf { - // Always set to './'. We will chdir into the directory specified on the - // command line, so that when the server is running, all paths are relative - // to that. 
workdir: PathBuf::from("./"), no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), @@ -95,12 +83,11 @@ impl Default for SafeKeeperConf { broker_endpoint: storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint"), - backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + backup_runtime_threads: None, wal_backup_enabled: true, auth_validation_public_key_path: None, - heartbeat_timeout: DEFAULT_HEARTBEAT_TIMEOUT, - max_offloader_lag_bytes: DEFAULT_MAX_OFFLOADER_LAG_BYTES, - log_format: LogFormat::Plain, + heartbeat_timeout: Duration::new(5, 0), + max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, } } } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index a5d373a1da..fd5f010b3d 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -20,14 +20,21 @@ use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, wal_backup_launcher_tx: Option>, - conf: SafeKeeperConf, + conf: Option, } impl GlobalTimelinesState { + /// Get configuration, which must be set once during init. + fn get_conf(&self) -> &SafeKeeperConf { + self.conf + .as_ref() + .expect("GlobalTimelinesState conf is not initialized") + } + /// Get dependencies for a timeline constructor. fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { ( - self.conf.clone(), + self.get_conf().clone(), self.wal_backup_launcher_tx.as_ref().unwrap().clone(), ) } @@ -55,7 +62,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), wal_backup_launcher_tx: None, - conf: SafeKeeperConf::default(), + conf: None, }) }); @@ -71,12 +78,12 @@ impl GlobalTimelines { let mut state = TIMELINES_STATE.lock().unwrap(); assert!(state.wal_backup_launcher_tx.is_none()); state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); - state.conf = conf; + state.conf = Some(conf); // Iterate through all directories and load tenants for all directories // named as a valid tenant_id. let mut tenant_count = 0; - let tenants_dir = state.conf.workdir.clone(); + let tenants_dir = state.get_conf().workdir.clone(); for tenants_dir_entry in std::fs::read_dir(&tenants_dir) .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))? { @@ -111,7 +118,7 @@ impl GlobalTimelines { state: &mut MutexGuard, tenant_id: TenantId, ) -> Result<()> { - let timelines_dir = state.conf.tenant_dir(&tenant_id); + let timelines_dir = state.get_conf().tenant_dir(&tenant_id); for timelines_dir_entry in std::fs::read_dir(&timelines_dir) .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))? { @@ -122,7 +129,7 @@ impl GlobalTimelines { { let ttid = TenantTimelineId::new(tenant_id, timeline_id); match Timeline::load_timeline( - state.conf.clone(), + state.get_conf().clone(), ttid, state.wal_backup_launcher_tx.as_ref().unwrap().clone(), ) { @@ -281,7 +288,11 @@ impl GlobalTimelines { } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. - let dir_path = TIMELINES_STATE.lock().unwrap().conf.timeline_dir(ttid); + let dir_path = TIMELINES_STATE + .lock() + .unwrap() + .get_conf() + .timeline_dir(ttid); let dir_existed = delete_dir(dir_path)?; Ok(TimelineDeleteForceResult { @@ -327,7 +338,13 @@ impl GlobalTimelines { // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. 
In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir(TIMELINES_STATE.lock().unwrap().conf.tenant_dir(tenant_id))?; + delete_dir( + TIMELINES_STATE + .lock() + .unwrap() + .get_conf() + .tenant_dir(tenant_id), + )?; let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); if !tlis_after_delete.is_empty() { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 300e9a1cba..ae4d4cce09 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -37,8 +37,11 @@ pub fn wal_backup_launcher_thread_main( conf: SafeKeeperConf, wal_backup_launcher_rx: Receiver, ) { - let rt = Builder::new_multi_thread() - .worker_threads(conf.backup_runtime_threads) + let mut builder = Builder::new_multi_thread(); + if let Some(num_threads) = conf.backup_runtime_threads { + builder.worker_threads(num_threads); + } + let rt = builder .enable_all() .build() .expect("failed to create wal backup runtime"); From d1edc8aa00eeebbc20a5839c614c7f4c541f4b97 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Mon, 12 Dec 2022 16:55:40 +0100 Subject: [PATCH 054/167] Deprecate old runner for deploy job (#3070) As we plan to no longer use them Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 44b691754a..79a95858b7 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -50,7 +50,11 @@ jobs: id: build-tag check-codestyle-python: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: [ self-hosted, dev, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned + options: --init + steps: - name: Checkout uses: actions/checkout@v3 @@ -725,7 +729,7 @@ jobs: crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: [ self-hosted, dev, x64 ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -747,8 +751,8 @@ jobs: fi deploy: - runs-on: [ self-hosted, Linux, k8s-runner ] - #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + runs-on: [ self-hosted, dev, x64 ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] @@ -915,7 +919,7 @@ jobs: deploy-proxy: runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
     needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
     if: |
@@ -958,7 +962,7 @@ jobs:
   deploy-storage-broker:
     name: deploy storage broker on old staging and old prod
     runs-on: [ self-hosted, dev, x64 ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
     # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
     needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
     if: |

From 22ae67af8d9b939f736e576fc0100d03354f7712 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Tue, 13 Dec 2022 00:27:59 +0100
Subject: [PATCH 055/167] refactor: use new type LayerFileName when referring
 to layer file names in PathBuf/RemotePath (#3026)

refactor: use new type LayerFileName when referring to layer file names
in PathBuf/RemotePath

Before this patch, we would sometimes carry around plain file names in
`Path` types and/or awkwardly "rebase" paths to have a unified
representation of the layer file name between local and remote.

This patch introduces a new type `LayerFileName` which replaces the use
of `Path` / `PathBuf` / `RemotePath` in the `storage_sync2` APIs.
Instead of holding a string, it contains the parsed representation of
the image and delta file name. When we need the file name, e.g., to
construct a local path or remote object key, we construct the name
ad hoc.

`LayerFileName` is also serde {Dese,Se}rializable, and in an initial
version of this patch, it was supposed to be used directly inside
`IndexPart`, replacing `RemotePath`. However, commit

    3122f3282f1a7f4639141f5a5a451cefae53a43d
    Ignore backup files (ones with .n.old suffix) in download_missing

fixed handling of `*.old` backup file names in IndexPart, and we need
to carry that behavior forward. The solution is to remove `*.old`
backup file names during deserialization. When we re-serialize the
IndexPart, the `*.old` file will be gone. This leaks the `.old` file in
remote storage, but makes it safe to clean it up later.

There is additional churn from a preliminary refactoring that got
squashed into this change:

    split off LayerMap's needs from trait Layer into super trait

That refactoring renames `Layer` to `PersistentLayer` and splits off a
subset of the functions into a super-trait called `Layer`. The super
trait implements just the functions needed by `LayerMap`, whereas
`PersistentLayer` adds the context of the pageserver. The naming is
imperfect, as some functions that reside in `PersistentLayer` have
nothing persistence-specific about them. But it's a step in the right
direction.
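To make the `*.old` handling concrete, here is a minimal, self-contained
sketch of the cleaning step. The names below are hypothetical stand-ins;
the actual patch implements this with `UncleanLayerFileName` and
`IndexPartUnclean::remove_unclean_layer_file_names` during serde
deserialization, as shown in the index.rs changes further down:

    /// Hypothetical stand-in for the parsed layer file name type.
    struct ParsedName(String);

    /// Drop "*.old" backup entries and keep everything else as a parsed
    /// name. Re-serializing the surviving names later simply leaves the
    /// backups out, which is what makes the leaked remote files safe to
    /// delete afterwards.
    fn clean_names(raw: &[&str]) -> Vec<ParsedName> {
        raw.iter()
            .filter(|name| !name.ends_with(".old"))
            .map(|name| ParsedName((*name).to_string()))
            .collect()
    }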
--- libs/remote_storage/src/lib.rs | 2 +- pageserver/benches/bench_layer_map.rs | 92 ++---- pageserver/src/storage_sync2.rs | 116 ++++--- pageserver/src/storage_sync2/download.rs | 27 +- pageserver/src/storage_sync2/index.rs | 183 ++++++++--- pageserver/src/tenant.rs | 3 +- pageserver/src/tenant/block_io.rs | 2 +- pageserver/src/tenant/delta_layer.rs | 203 +++++++------ pageserver/src/tenant/filename.rs | 102 ++++++- pageserver/src/tenant/image_layer.rs | 139 ++++----- pageserver/src/tenant/inmemory_layer.rs | 151 ++++------ pageserver/src/tenant/layer_map.rs | 71 +++-- pageserver/src/tenant/storage_layer.rs | 76 ++--- pageserver/src/tenant/timeline.rs | 283 +++++++++--------- .../test_tenants_with_remote_storage.py | 6 +- 15 files changed, 804 insertions(+), 652 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index f72689884e..04335d8f2f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -142,7 +142,7 @@ impl std::fmt::Display for DownloadError { write!(f, "Failed to download a remote file due to user input: {e}") } DownloadError::NotFound => write!(f, "No file found for the remote object id given"), - DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"), + DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), } } } diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index a99580bc65..6001377811 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,10 +1,9 @@ use anyhow::Result; -use pageserver::repository::{Key, Value}; +use pageserver::repository::Key; use pageserver::tenant::filename::{DeltaFileName, ImageFileName}; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::Layer; -use pageserver::tenant::storage_layer::ValueReconstructResult; use pageserver::tenant::storage_layer::ValueReconstructState; +use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; @@ -14,7 +13,7 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use std::time::Instant; -use utils::id::{TenantId, TimelineId}; + use utils::lsn::Lsn; use criterion::{criterion_group, criterion_main, Criterion}; @@ -25,14 +24,6 @@ struct DummyDelta { } impl Layer for DummyDelta { - fn get_tenant_id(&self) -> TenantId { - TenantId::from_str("00000000000000000000000000000000").unwrap() - } - - fn get_timeline_id(&self) -> TimelineId { - TimelineId::from_str("00000000000000000000000000000000").unwrap() - } - fn get_key_range(&self) -> Range { self.key_range.clone() } @@ -40,15 +31,6 @@ impl Layer for DummyDelta { fn get_lsn_range(&self) -> Range { self.lsn_range.clone() } - - fn filename(&self) -> PathBuf { - todo!() - } - - fn local_path(&self) -> Option { - todo!() - } - fn get_value_reconstruct_data( &self, _key: Key, @@ -62,24 +44,12 @@ impl Layer for DummyDelta { true } - fn is_in_memory(&self) -> bool { - false - } - - fn iter(&self) -> Box> + '_> { - panic!() - } - - fn key_iter(&self) -> Box + '_> { - panic!("Not implemented") - } - - fn delete(&self) -> Result<()> { - panic!() - } - fn dump(&self, _verbose: bool) -> Result<()> { - todo!() + unimplemented!() + } + + fn short_id(&self) -> String { + unimplemented!() } } @@ -89,14 +59,6 @@ struct DummyImage { } impl Layer for DummyImage { - fn get_tenant_id(&self) -> TenantId { - 
TenantId::from_str("00000000000000000000000000000000").unwrap() - } - - fn get_timeline_id(&self) -> TimelineId { - TimelineId::from_str("00000000000000000000000000000000").unwrap() - } - fn get_key_range(&self) -> Range { self.key_range.clone() } @@ -106,14 +68,6 @@ impl Layer for DummyImage { self.lsn..(self.lsn + 1) } - fn filename(&self) -> PathBuf { - todo!() - } - - fn local_path(&self) -> Option { - todo!() - } - fn get_value_reconstruct_data( &self, _key: Key, @@ -127,29 +81,17 @@ impl Layer for DummyImage { false } - fn is_in_memory(&self) -> bool { - false - } - - fn iter(&self) -> Box> + '_> { - panic!() - } - - fn key_iter(&self) -> Box + '_> { - panic!("Not implemented") - } - - fn delete(&self) -> Result<()> { - panic!() - } - fn dump(&self, _verbose: bool) -> Result<()> { - todo!() + unimplemented!() + } + + fn short_id(&self) -> String { + unimplemented!() } } -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); @@ -185,7 +127,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the pages contained, at LSN right // before the image layer was created. This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -258,7 +200,7 @@ fn bench_from_real_project(c: &mut Criterion) { // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. fn bench_sequential(c: &mut Criterion) { - let mut layer_map = LayerMap::default(); + let mut layer_map: LayerMap = LayerMap::default(); // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. // diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index b5c5a0d25d..7cc0eac2bf 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -197,12 +197,11 @@ pub use download::{is_temp_download_file, list_remote_timelines}; use std::collections::{HashMap, VecDeque}; use std::fmt::Debug; use std::ops::DerefMut; -use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use anyhow::ensure; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::runtime::Runtime; use tracing::{info, warn}; use tracing::{info_span, Instrument}; @@ -215,6 +214,7 @@ use crate::metrics::MeasureRemoteOp; use crate::metrics::RemoteOpFileKind; use crate::metrics::RemoteOpKind; use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS; +use crate::tenant::filename::LayerFileName; use crate::{ config::PageServerConf, storage_sync::index::LayerFileMetadata, @@ -287,7 +287,7 @@ struct UploadQueueInitialized { /// All layer files stored in the remote storage, taking into account all /// in-progress and queued operations - latest_files: HashMap, + latest_files: HashMap, /// Metadata stored in the remote storage, taking into account all /// in-progress and queued operations. 
@@ -357,10 +357,6 @@ impl UploadQueue { fn initialize_with_current_remote_index_part( &mut self, - conf: &'static PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, - index_part: &IndexPart, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { @@ -371,18 +367,13 @@ impl UploadQueue { } let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); - let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); - for timeline_name in &index_part.timeline_layers { - let local_path = timeline_path.join(timeline_name); - let remote_timeline_path = conf.remote_path(&local_path).expect( - "Remote timeline path and local timeline path were constructed form the same conf", - ); + for layer_name in &index_part.timeline_layers { let layer_metadata = index_part .layer_metadata - .get(timeline_name) + .get(layer_name) .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); - files.insert(remote_timeline_path, layer_metadata); + files.insert(layer_name.to_owned(), layer_metadata); } let index_part_metadata = index_part.parse_metadata()?; @@ -431,13 +422,13 @@ struct UploadTask { #[derive(Debug)] enum UploadOp { /// Upload a layer file - UploadLayer(PathBuf, LayerFileMetadata), + UploadLayer(LayerFileName, LayerFileMetadata), /// Upload the metadata file UploadMetadata(IndexPart, Lsn), /// Delete a file. - Delete(RemoteOpFileKind, PathBuf), + Delete(RemoteOpFileKind, LayerFileName), /// Barrier. When the barrier operation is reached, Barrier(tokio::sync::watch::Sender<()>), @@ -446,14 +437,16 @@ enum UploadOp { impl std::fmt::Display for UploadOp { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { - UploadOp::UploadLayer(path, metadata) => write!( - f, - "UploadLayer({}, size={:?})", - path.display(), - metadata.file_size() - ), + UploadOp::UploadLayer(path, metadata) => { + write!( + f, + "UploadLayer({}, size={:?})", + path.file_name(), + metadata.file_size() + ) + } UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn), - UploadOp::Delete(_, path) => write!(f, "Delete({})", path.display()), + UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()), UploadOp::Barrier(_) => write!(f, "Barrier"), } } @@ -465,12 +458,7 @@ impl RemoteTimelineClient { /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part( - self.conf, - self.tenant_id, - self.timeline_id, - index_part, - )?; + upload_queue.initialize_with_current_remote_index_part(index_part)?; Ok(()) } @@ -524,13 +512,15 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. 
pub async fn download_layer_file( &self, - remote_path: &RemotePath, + layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { let downloaded_size = download::download_layer_file( self.conf, &self.storage_impl, - remote_path, + self.tenant_id, + self.timeline_id, + layer_file_name, layer_metadata, ) .measure_remote_op( @@ -548,13 +538,13 @@ impl RemoteTimelineClient { let new_metadata = LayerFileMetadata::new(downloaded_size); let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - if let Some(upgraded) = upload_queue.latest_files.get_mut(remote_path) { + if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { upgraded.merge(&new_metadata); } else { // The file should exist, since we just downloaded it. warn!( "downloaded file {:?} not found in local copy of the index file", - remote_path + layer_file_name ); } } @@ -611,7 +601,7 @@ impl RemoteTimelineClient { /// pub fn schedule_layer_file_upload( self: &Arc, - path: &Path, + layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); @@ -626,13 +616,16 @@ impl RemoteTimelineClient { upload_queue .latest_files - .insert(self.conf.remote_path(path)?, layer_metadata.clone()); + .insert(layer_file_name.clone(), layer_metadata.clone()); - let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone()); + let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone()); self.update_upload_queue_unfinished_metric(1, &op); upload_queue.queued_operations.push_back(op); - info!("scheduled layer file upload {}", path.display()); + info!( + "scheduled layer file upload {}", + layer_file_name.file_name() + ); // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); @@ -644,16 +637,13 @@ impl RemoteTimelineClient { /// /// The deletion won't actually be performed, until all preceding /// upload operations have completed succesfully. - pub fn schedule_layer_file_deletion(self: &Arc, paths: &[PathBuf]) -> anyhow::Result<()> { + pub fn schedule_layer_file_deletion( + self: &Arc, + names: &[LayerFileName], + ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - // Convert the paths into RemotePaths, and gather other information we need. - let mut remote_paths = Vec::with_capacity(paths.len()); - for path in paths { - remote_paths.push(self.conf.remote_path(path)?); - } - // Deleting layers doesn't affect the values stored in TimelineMetadata, // so we don't need update it. Just serialize it. let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; @@ -667,8 +657,8 @@ impl RemoteTimelineClient { // from latest_files, but not yet scheduled for deletion. Use a closure // to syntactically forbid ? or bail! calls here. 
let no_bail_here = || { - for remote_path in remote_paths { - upload_queue.latest_files.remove(&remote_path); + for name in names { + upload_queue.latest_files.remove(name); } let index_part = IndexPart::new( @@ -681,11 +671,11 @@ impl RemoteTimelineClient { upload_queue.queued_operations.push_back(op); // schedule the actual deletions - for path in paths { - let op = UploadOp::Delete(RemoteOpFileKind::Layer, PathBuf::from(path)); + for name in names { + let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone()); self.update_upload_queue_unfinished_metric(1, &op); upload_queue.queued_operations.push_back(op); - info!("scheduled layer file deletion {}", path.display()); + info!("scheduled layer file deletion {}", name.file_name()); } // Launch the tasks immediately, if possible @@ -841,7 +831,11 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref path, ref layer_metadata) => { + UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => { + let path = &self + .conf + .timeline_path(&self.timeline_id, &self.tenant_id) + .join(layer_file_name.file_name()); upload::upload_timeline_layer( self.conf, &self.storage_impl, @@ -872,7 +866,11 @@ impl RemoteTimelineClient { ) .await } - UploadOp::Delete(metric_file_kind, ref path) => { + UploadOp::Delete(metric_file_kind, ref layer_file_name) => { + let path = &self + .conf + .timeline_path(&self.timeline_id, &self.tenant_id) + .join(layer_file_name.file_name()); delete::delete_layer(self.conf, &self.storage_impl, path) .measure_remote_op( self.tenant_id, @@ -1078,7 +1076,7 @@ mod tests { use super::*; use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; - use std::collections::HashSet; + use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { @@ -1102,8 +1100,8 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let mut avec: Vec<&str> = a.iter().map(|a| a.as_str()).collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); avec.sort(); let mut bvec = b.to_vec(); @@ -1198,11 +1196,11 @@ mod tests { std::fs::write(timeline_path.join("bar"), &content_bar)?; client.schedule_layer_file_upload( - &timeline_path.join("foo"), + &LayerFileName::Test("foo".to_owned()), &LayerFileMetadata::new(content_foo.len() as u64), )?; client.schedule_layer_file_upload( - &timeline_path.join("bar"), + &LayerFileName::Test("bar".to_owned()), &LayerFileMetadata::new(content_bar.len() as u64), )?; @@ -1244,10 +1242,10 @@ mod tests { let content_baz = dummy_contents("baz"); std::fs::write(timeline_path.join("baz"), &content_baz)?; client.schedule_layer_file_upload( - &timeline_path.join("baz"), + &LayerFileName::Test("baz".to_owned()), &LayerFileMetadata::new(content_baz.len() as u64), )?; - client.schedule_layer_file_deletion(&[timeline_path.join("foo")])?; + client.schedule_layer_file_deletion(&[LayerFileName::Test("foo".to_owned())])?; { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs index 18a6ac0179..0d25d88a97 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/storage_sync2/download.rs @@ -6,15 +6,16 @@ use anyhow::{bail, Context}; use 
futures::stream::{FuturesUnordered, StreamExt}; use tokio::fs; use tokio::io::AsyncWriteExt; -use tracing::debug; +use tracing::{debug, info_span, Instrument}; use crate::config::PageServerConf; use crate::storage_sync::index::LayerFileMetadata; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use crate::tenant::filename::LayerFileName; +use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; -use super::index::IndexPart; +use super::index::{IndexPart, IndexPartUnclean}; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { fs::File::open(path).await?.sync_all().await @@ -28,10 +29,16 @@ async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Er pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - remote_path: &'a RemotePath, + tenant_id: TenantId, + timeline_id: TimelineId, + layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, ) -> anyhow::Result { - let local_path = conf.local_path(remote_path); + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + + let local_path = timeline_path.join(layer_file_name.file_name()); + + let remote_path = conf.remote_path(&local_path)?; // Perform a rename inspired by durable_rename from file_utils.c. // The sequence: @@ -52,7 +59,7 @@ pub async fn download_layer_file<'a>( temp_file_path.display() ) })?; - let mut download = storage.download(remote_path).await.with_context(|| { + let mut download = storage.download(&remote_path).await.with_context(|| { format!( "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" ) @@ -169,7 +176,9 @@ pub async fn list_remote_timelines<'a>( part_downloads.push(async move { ( timeline_id, - download_index_part(conf, &storage_clone, tenant_id, timeline_id).await, + download_index_part(conf, &storage_clone, tenant_id, timeline_id) + .instrument(info_span!("download_index_part", timeline=%timeline_id)) + .await, ) }); } @@ -211,11 +220,13 @@ pub async fn download_index_part( .with_context(|| format!("Failed to download an index part into file {index_part_path:?}")) .map_err(DownloadError::Other)?; - let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) + let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes) .with_context(|| { format!("Failed to deserialize index part file into file {index_part_path:?}") }) .map_err(DownloadError::Other)?; + let index_part = index_part.remove_unclean_layer_file_names(); + Ok(index_part) } diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index 5560712a1b..ce9a43ed3b 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -4,11 +4,11 @@ use std::collections::{HashMap, HashSet}; -use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; +use tracing::warn; -use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::{filename::LayerFileName, metadata::TimelineMetadata}; use utils::lsn::Lsn; @@ -62,7 +62,10 @@ impl LayerFileMetadata { /// remember to add a test case for the changed version. #[serde_as] #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct IndexPart { +pub struct IndexPartImpl +where + L: std::hash::Hash + PartialEq + Eq, +{ /// Debugging aid describing the version of this type. 
#[serde(default)] version: usize, @@ -70,19 +73,19 @@ pub struct IndexPart { /// Layer names, which are stored on the remote storage. /// /// Additional metadata can might exist in `layer_metadata`. - pub timeline_layers: HashSet, + pub timeline_layers: HashSet, /// FIXME: unused field. This should be removed, but that changes the on-disk format, /// so we need to make sure we're backwards-` (and maybe forwards-) compatible /// First pass is to move it to Optional and the next would be its removal - missing_layers: Option>, + missing_layers: Option>, /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - #[serde(default)] - pub layer_metadata: HashMap, + #[serde(default = "HashMap::default")] + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated here for convenience. @@ -91,6 +94,104 @@ pub struct IndexPart { metadata_bytes: Vec, } +// TODO seems like another part of the remote storage file format +// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 +pub type IndexPart = IndexPartImpl; + +pub type IndexPartUnclean = IndexPartImpl; + +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub enum UncleanLayerFileName { + Clean(LayerFileName), + BackupFile(String), +} + +impl<'de> serde::Deserialize<'de> for UncleanLayerFileName { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_string(UncleanLayerFileNameVisitor) + } +} + +struct UncleanLayerFileNameVisitor; + +impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor { + type Value = UncleanLayerFileName; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + formatter, + "a string that is a valid LayerFileName or '.old' backup file name" + ) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + let maybe_clean: Result = v.parse(); + match maybe_clean { + Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)), + Err(e) => { + if v.ends_with(".old") { + Ok(UncleanLayerFileName::BackupFile(v.to_owned())) + } else { + Err(E::custom(e)) + } + } + } + } +} + +impl UncleanLayerFileName { + fn into_clean(self) -> Option { + match self { + UncleanLayerFileName::Clean(clean) => Some(clean), + UncleanLayerFileName::BackupFile(_) => None, + } + } +} + +impl IndexPartUnclean { + pub fn remove_unclean_layer_file_names(self) -> IndexPart { + let IndexPartUnclean { + version, + timeline_layers, + // this is an unused field, ignore it on cleaning + missing_layers: _, + layer_metadata, + disk_consistent_lsn, + metadata_bytes, + } = self; + + IndexPart { + version, + timeline_layers: timeline_layers + .into_iter() + .filter_map(|unclean_file_name| match unclean_file_name { + UncleanLayerFileName::Clean(clean_name) => Some(clean_name), + UncleanLayerFileName::BackupFile(backup_file_name) => { + // For details see https://github.com/neondatabase/neon/issues/3024 + warn!( + "got backup file on the remote storage, ignoring it {backup_file_name}" + ); + None + } + }) + .collect(), + missing_layers: None, + layer_metadata: layer_metadata + .into_iter() + .filter_map(|(l, m)| l.into_clean().map(|l| (l, m))) + .collect(), + disk_consistent_lsn, + metadata_bytes, + } + } +} + impl IndexPart { /// When adding or modifying any parts of `IndexPart`, increment the version so 
that it can be /// used to understand later versions. @@ -100,23 +201,17 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; pub fn new( - layers_and_metadata: HashMap, + layers_and_metadata: HashMap, disk_consistent_lsn: Lsn, metadata_bytes: Vec, ) -> Self { let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len()); let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len()); - for (remote_path, metadata) in &layers_and_metadata { + for (remote_name, metadata) in &layers_and_metadata { + timeline_layers.insert(remote_name.to_owned()); let metadata = IndexLayerMetadata::from(metadata); - match remote_path.object_name() { - Some(layer_name) => { - timeline_layers.insert(layer_name.to_owned()); - layer_metadata.insert(layer_name.to_owned(), metadata); - } - // TODO move this on a type level: we know, that every layer entry does have a name - None => panic!("Layer {remote_path:?} has no file name, skipping"), - } + layer_metadata.insert(remote_name.to_owned(), metadata); } Self { @@ -156,21 +251,22 @@ mod tests { fn v0_indexpart_is_parsed() { let example = r#"{ "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "missing_layers":["not_a_real_layer_but_adding_coverage"], + "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"], "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] }"#; let expected = IndexPart { version: 0, - timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]), - missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])), + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), + missing_layers: None, // disabled fields should not carry unused values further layer_metadata: HashMap::default(), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: 
[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), }; - let part = serde_json::from_str::(example).unwrap(); + let part: IndexPartUnclean = serde_json::from_str(example).unwrap(); + let part = part.remove_unclean_layer_file_names(); assert_eq!(part, expected); } @@ -179,10 +275,10 @@ mod tests { let example = r#"{ "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "missing_layers":["not_a_real_layer_but_adding_coverage"], + "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, - "not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } + "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -191,13 +287,13 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 1, - timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]), - missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])), + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), + missing_layers: None, layer_metadata: HashMap::from([ - (String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: Some(25600000), }), - (String::from("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata { + (LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: Some(9007199254741001), @@ -207,7 +303,9 @@ mod tests { metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), }; - let part = serde_json::from_str::(example).unwrap(); + let part = serde_json::from_str::(example) + .unwrap() + .remove_unclean_layer_file_names(); assert_eq!(part, expected); } @@ -218,7 +316,7 @@ mod tests { "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, - "not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } + "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", 
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -227,29 +325,24 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, - timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string()]), + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: HashMap::from([ - ( - "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string(), - IndexLayerMetadata { - file_size: Some(25600000), - } - ), - ( - "not_a_real_layer_but_adding_coverage".to_string(), - IndexLayerMetadata { - // serde_json should always parse this but this might be a double with jq for - // example. - file_size: Some(9007199254741001), - } - ) + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + file_size: Some(25600000), + }), + (LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: Some(9007199254741001), + }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), missing_layers: None, }; - let part = serde_json::from_str::(example).unwrap(); + let part = serde_json::from_str::(example).unwrap(); + let part = part.remove_unclean_layer_file_names(); assert_eq!(part, expected); } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a7601ba2a7..80b65d281f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -57,6 +57,7 @@ use crate::storage_sync::RemoteTimelineClient; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::metadata::load_metadata; +use crate::tenant::storage_layer::Layer; use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; @@ -89,8 +90,6 @@ mod timeline; pub mod size; -use storage_layer::Layer; - pub use timeline::Timeline; // re-export this function so that page_cache.rs can use it. 
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index ab36754c9e..e3cc800447 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -60,7 +60,7 @@ where /// /// ```no_run /// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; -/// # let reader: FileBlockReader = todo!(); +/// # let reader: FileBlockReader = unimplemented!("stub"); /// let cursor = reader.block_cursor(); /// let buf = cursor.read_blk(1); /// // do stuff with 'buf' diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index dcd6956640..d8aaa3e8b9 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -30,7 +30,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::filename::{DeltaFileName, PathOrConf}; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::{ + PersistentLayer, ValueReconstructResult, ValueReconstructState, +}; use crate::virtual_file::VirtualFile; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; @@ -52,6 +54,9 @@ use utils::{ lsn::Lsn, }; +use super::filename::LayerFileName; +use super::storage_layer::Layer; + /// /// Header stored in the beginning of the file /// @@ -194,14 +199,6 @@ pub struct DeltaLayerInner { } impl Layer for DeltaLayer { - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } - fn get_key_range(&self) -> Range { self.key_range.clone() } @@ -209,13 +206,86 @@ impl Layer for DeltaLayer { fn get_lsn_range(&self) -> Range { self.lsn_range.clone() } - - fn filename(&self) -> PathBuf { - PathBuf::from(self.layer_name().to_string()) + fn is_incremental(&self) -> bool { + true } - fn local_path(&self) -> Option { - Some(self.path()) + fn short_id(&self) -> String { + self.filename().file_name() + } + /// debugging function to print out the contents of the layer + fn dump(&self, verbose: bool) -> Result<()> { + println!( + "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenant_id, + self.timeline_id, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + if !verbose { + return Ok(()); + } + + let inner = self.load()?; + + println!( + "index_start_blk: {}, root {}", + inner.index_start_blk, inner.index_root_blk + ); + + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + tree_reader.dump()?; + + let mut cursor = file.block_cursor(); + + // A subroutine to dump a single blob + let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result { + let buf = cursor.read_blob(blob_ref.pos())?; + let val = Value::des(&buf)?; + let desc = match val { + Value::Image(img) => { + format!(" img {} bytes", img.len()) + } + Value::WalRecord(rec) => { + let wal_desc = walrecord::describe_wal_record(&rec)?; + format!( + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + ) + } + }; + Ok(desc) + }; + + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |delta_key, val| { + let blob_ref = BlobRef(val); + let key = 
DeltaKey::extract_key_from_buf(delta_key); + let lsn = DeltaKey::extract_lsn_from_buf(delta_key); + + let desc = match dump_blob(blob_ref) { + Ok(desc) => desc, + Err(err) => format!("ERROR: {}", err), + }; + println!(" key {} at {}: {}", key, lsn, desc); + true + }, + )?; + + Ok(()) } fn get_value_reconstruct_data( @@ -302,6 +372,24 @@ impl Layer for DeltaLayer { Ok(ValueReconstructResult::Complete) } } +} + +impl PersistentLayer for DeltaLayer { + fn get_tenant_id(&self) -> TenantId { + self.tenant_id + } + + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id + } + + fn filename(&self) -> LayerFileName { + self.layer_name().into() + } + + fn local_path(&self) -> PathBuf { + self.path() + } fn iter<'a>(&'a self) -> Box> + 'a> { let inner = match self.load() { @@ -332,89 +420,6 @@ impl Layer for DeltaLayer { fs::remove_file(self.path())?; Ok(()) } - - fn is_incremental(&self) -> bool { - true - } - - fn is_in_memory(&self) -> bool { - false - } - - /// debugging function to print out the contents of the layer - fn dump(&self, verbose: bool) -> Result<()> { - println!( - "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenant_id, - self.timeline_id, - self.key_range.start, - self.key_range.end, - self.lsn_range.start, - self.lsn_range.end - ); - - if !verbose { - return Ok(()); - } - - let inner = self.load()?; - - println!( - "index_start_blk: {}, root {}", - inner.index_start_blk, inner.index_root_blk - ); - - let file = inner.file.as_ref().unwrap(); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - inner.index_start_blk, - inner.index_root_blk, - file, - ); - - tree_reader.dump()?; - - let mut cursor = file.block_cursor(); - - // A subroutine to dump a single blob - let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result { - let buf = cursor.read_blob(blob_ref.pos())?; - let val = Value::des(&buf)?; - let desc = match val { - Value::Image(img) => { - format!(" img {} bytes", img.len()) - } - Value::WalRecord(rec) => { - let wal_desc = walrecord::describe_wal_record(&rec)?; - format!( - " rec {} bytes will_init: {} {}", - buf.len(), - rec.will_init(), - wal_desc - ) - } - }; - Ok(desc) - }; - - tree_reader.visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |delta_key, val| { - let blob_ref = BlobRef(val); - let key = DeltaKey::extract_key_from_buf(delta_key); - let lsn = DeltaKey::extract_lsn_from_buf(delta_key); - - let desc = match dump_blob(blob_ref) { - Ok(desc) => desc, - Err(err) => format!("ERROR: {}", err), - }; - println!(" key {} at {}: {}", key, lsn, desc); - true - }, - )?; - - Ok(()) - } } impl DeltaLayer { @@ -511,8 +516,8 @@ impl DeltaLayer { } } PathOrConf::Path(path) => { - let actual_filename = Path::new(path.file_name().unwrap()); - let expected_filename = self.filename(); + let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned(); + let expected_filename = self.filename().file_name(); if actual_filename != expected_filename { println!( diff --git a/pageserver/src/tenant/filename.rs b/pageserver/src/tenant/filename.rs index 0ebf2d479b..6ecf9227c7 100644 --- a/pageserver/src/tenant/filename.rs +++ b/pageserver/src/tenant/filename.rs @@ -7,11 +7,12 @@ use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::path::PathBuf; +use std::str::FromStr; use utils::lsn::Lsn; // Note: Timeline::load_layer_map() relies on this sort order -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Hash)] pub struct DeltaFileName { pub key_range: Range, pub lsn_range: Range, 
@@ -101,7 +102,7 @@ impl fmt::Display for DeltaFileName { } } -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Hash)] pub struct ImageFileName { pub key_range: Range, pub lsn: Lsn, @@ -172,6 +173,103 @@ impl fmt::Display for ImageFileName { ) } } +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub enum LayerFileName { + Image(ImageFileName), + Delta(DeltaFileName), + #[cfg(test)] + Test(String), +} + +impl LayerFileName { + pub fn file_name(&self) -> String { + match self { + LayerFileName::Image(fname) => format!("{fname}"), + LayerFileName::Delta(fname) => format!("{fname}"), + #[cfg(test)] + LayerFileName::Test(fname) => fname.to_string(), + } + } + #[cfg(test)] + pub(crate) fn new_test(name: &str) -> LayerFileName { + LayerFileName::Test(name.to_owned()) + } +} + +impl From for LayerFileName { + fn from(fname: ImageFileName) -> Self { + LayerFileName::Image(fname) + } +} +impl From for LayerFileName { + fn from(fname: DeltaFileName) -> Self { + LayerFileName::Delta(fname) + } +} + +// include a `/` in the name as an additional layer of robustness +// because `/` chars are not allowed in UNIX paths +#[cfg(test)] +const LAYER_FILE_NAME_TEST_PREFIX: &str = "LAYER_FILE_NAME::test/"; + +impl FromStr for LayerFileName { + type Err = String; + + fn from_str(value: &str) -> Result { + #[cfg(test)] + if let Some(value) = value.strip_prefix(LAYER_FILE_NAME_TEST_PREFIX) { + return Ok(LayerFileName::Test(value.to_owned())); + } + let delta = DeltaFileName::parse_str(value); + let image = ImageFileName::parse_str(value); + let ok = match (delta, image) { + (None, None) => { + return Err(format!( + "neither delta nor image layer file name: {value:?}" + )) + } + (Some(delta), None) => LayerFileName::Delta(delta), + (None, Some(image)) => LayerFileName::Image(image), + (Some(_), Some(_)) => unreachable!(), + }; + Ok(ok) + } +} + +impl serde::Serialize for LayerFileName { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + LayerFileName::Image(fname) => serializer.serialize_str(&format!("{}", fname)), + LayerFileName::Delta(fname) => serializer.serialize_str(&format!("{}", fname)), + #[cfg(test)] + LayerFileName::Test(t) => { + serializer.serialize_str(&format!("{LAYER_FILE_NAME_TEST_PREFIX}{t}")) + } + } + } +} + +struct LayerFileNameVisitor; + +impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { + type Value = LayerFileName; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!( + formatter, + "a string that is a valid image or delta layer file name" + ) + } + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + v.parse().map_err(|e| E::custom(e)) + } +} /// Helper enum to hold a PageServerConf, or a path /// diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index 8409d34bc9..e08e938a4f 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -26,7 +26,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::filename::{ImageFileName, PathOrConf}; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::{ + PersistentLayer, ValueReconstructResult, ValueReconstructState, +}; use crate::virtual_file::VirtualFile; use 
crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; @@ -48,6 +50,9 @@ use utils::{ lsn::Lsn, }; +use super::filename::LayerFileName; +use super::storage_layer::Layer; + /// /// Header stored in the beginning of the file /// @@ -120,22 +125,6 @@ pub struct ImageLayerInner { } impl Layer for ImageLayer { - fn filename(&self) -> PathBuf { - PathBuf::from(self.layer_name().to_string()) - } - - fn local_path(&self) -> Option { - Some(self.path()) - } - - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } - fn get_key_range(&self) -> Range { self.key_range.clone() } @@ -144,58 +133,12 @@ impl Layer for ImageLayer { // End-bound is exclusive self.lsn..(self.lsn + 1) } - - /// Look up given page in the file - fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result { - assert!(self.key_range.contains(&key)); - assert!(lsn_range.start >= self.lsn); - assert!(lsn_range.end >= self.lsn); - - let inner = self.load()?; - - let file = inner.file.as_ref().unwrap(); - let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); - - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader.get(&keybuf)? { - let blob = file.block_cursor().read_blob(offset).with_context(|| { - format!( - "failed to read value from data file {} at offset {}", - self.filename().display(), - offset - ) - })?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) - } - } - - fn iter(&self) -> Box>> { - todo!(); - } - - fn delete(&self) -> Result<()> { - // delete underlying file - fs::remove_file(self.path())?; - Ok(()) - } - fn is_incremental(&self) -> bool { false } - fn is_in_memory(&self) -> bool { - false + fn short_id(&self) -> String { + self.filename().file_name() } /// debugging function to print out the contents of the layer @@ -223,6 +166,68 @@ impl Layer for ImageLayer { Ok(()) } + + /// Look up given page in the file + fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + assert!(self.key_range.contains(&key)); + assert!(lsn_range.start >= self.lsn); + assert!(lsn_range.end >= self.lsn); + + let inner = self.load()?; + + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); + + let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + key.write_to_byte_slice(&mut keybuf); + if let Some(offset) = tree_reader.get(&keybuf)? 
{ + let blob = file.block_cursor().read_blob(offset).with_context(|| { + format!( + "failed to read value from data file {} at offset {}", + self.path().display(), + offset + ) + })?; + let value = Bytes::from(blob); + + reconstruct_state.img = Some((self.lsn, value)); + Ok(ValueReconstructResult::Complete) + } else { + Ok(ValueReconstructResult::Missing) + } + } +} + +impl PersistentLayer for ImageLayer { + fn filename(&self) -> LayerFileName { + self.layer_name().into() + } + + fn local_path(&self) -> PathBuf { + self.path() + } + + fn get_tenant_id(&self) -> TenantId { + self.tenant_id + } + + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id + } + fn iter(&self) -> Box>> { + unimplemented!(); + } + + fn delete(&self) -> Result<()> { + // delete underlying file + fs::remove_file(self.path())?; + Ok(()) + } } impl ImageLayer { @@ -314,8 +319,8 @@ impl ImageLayer { } } PathOrConf::Path(path) => { - let actual_filename = Path::new(path.file_name().unwrap()); - let expected_filename = self.filename(); + let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned(); + let expected_filename = self.filename().file_name(); if actual_filename != expected_filename { println!( diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs index 9aa33a72ca..8f64281cb1 100644 --- a/pageserver/src/tenant/inmemory_layer.rs +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -10,9 +10,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockReader; use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter}; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; use crate::walrecord; -use anyhow::{bail, ensure, Result}; +use anyhow::{ensure, Result}; use std::cell::RefCell; use std::collections::HashMap; use tracing::*; @@ -26,9 +26,10 @@ use utils::{ // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use std::path::PathBuf; use std::sync::RwLock; +use super::storage_layer::Layer; + thread_local! { /// A buffer for serializing object during [`InMemoryLayer::put_value`]. /// This buffer is reused for each serialization to avoid additional malloc calls. @@ -75,33 +76,13 @@ impl InMemoryLayerInner { } } -impl Layer for InMemoryLayer { - // An in-memory layer can be spilled to disk into ephemeral file, - // This function is used only for debugging, so we don't need to be very precise. - // Construct a filename as if it was a delta layer. - fn filename(&self) -> PathBuf { - let inner = self.inner.read().unwrap(); - - let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); - - PathBuf::from(format!( - "inmem-{:016X}-{:016X}", - self.start_lsn.0, end_lsn.0 - )) - } - - fn local_path(&self) -> Option { - None - } - - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { +impl InMemoryLayer { + pub fn get_timeline_id(&self) -> TimelineId { self.timeline_id } +} +impl Layer for InMemoryLayer { fn get_key_range(&self) -> Range { Key::MIN..Key::MAX } @@ -116,73 +97,16 @@ impl Layer for InMemoryLayer { }; self.start_lsn..end_lsn } - - /// Look up given value in the layer. 
- fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result { - ensure!(lsn_range.start >= self.start_lsn); - let mut need_image = true; - - let inner = self.inner.read().unwrap(); - - let mut reader = inner.file.block_cursor(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos)?; - let value = Value::des(&buf)?; - match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - return Ok(ValueReconstructResult::Complete); - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - } - - // release lock on 'inner' - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - - fn iter(&self) -> Box>> { - todo!(); - } - - /// Nothing to do here. When you drop the last reference to the layer, it will - /// be deallocated. - fn delete(&self) -> Result<()> { - bail!("can't delete an InMemoryLayer") - } - fn is_incremental(&self) -> bool { // in-memory layer is always considered incremental. true } - fn is_in_memory(&self) -> bool { - true + fn short_id(&self) -> String { + let inner = self.inner.read().unwrap(); + + let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); + format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) } /// debugging function to print out the contents of the layer @@ -235,6 +159,55 @@ impl Layer for InMemoryLayer { Ok(()) } + + /// Look up given value in the layer. + fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + ensure!(lsn_range.start >= self.start_lsn); + let mut need_image = true; + + let inner = self.inner.read().unwrap(); + + let mut reader = inner.file.block_cursor(); + + // Scan the page versions backwards, starting from `lsn`. + if let Some(vec_map) = inner.index.get(&key) { + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + let buf = reader.read_blob(*pos)?; + let value = Value::des(&buf)?; + match value { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); + return Ok(ValueReconstructResult::Complete); + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back + need_image = false; + break; + } + } + } + } + } + + // release lock on 'inner' + + // If an older page image is needed to reconstruct the page, let the + // caller know. 
+ if need_image { + Ok(ValueReconstructResult::Continue) + } else { + Ok(ValueReconstructResult::Complete) + } + } } impl InMemoryLayer { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 9d914c1839..19252ecf6e 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -13,7 +13,6 @@ use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; use crate::tenant::inmemory_layer::InMemoryLayer; -use crate::tenant::storage_layer::Layer; use crate::tenant::storage_layer::{range_eq, range_overlaps}; use amplify_num::i256; use anyhow::Result; @@ -28,11 +27,12 @@ use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; +use super::storage_layer::Layer; + /// /// LayerMap tracks what layers exist on a timeline. /// -#[derive(Default)] -pub struct LayerMap { +pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new // records. If it is None, 'next_open_layer_at' will be set instead, indicating @@ -53,15 +53,27 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// All the historic layers are kept here - historic_layers: RTree, + historic_layers: RTree>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. - l0_delta_layers: Vec>, + l0_delta_layers: Vec>, } -struct LayerRTreeObject { - layer: Arc, +impl Default for LayerMap { + fn default() -> Self { + Self { + open_layer: None, + next_open_layer_at: None, + frozen_layers: VecDeque::default(), + historic_layers: RTree::default(), + l0_delta_layers: Vec::default(), + } + } +} + +struct LayerRTreeObject { + layer: Arc, envelope: AABB<[IntKey; 2]>, } @@ -185,7 +197,7 @@ impl Num for IntKey { } } -impl PartialEq for LayerRTreeObject { +impl PartialEq for LayerRTreeObject { fn eq(&self, other: &Self) -> bool { // FIXME: ptr_eq might fail to return true for 'dyn' // references. Clippy complains about this. In practice it @@ -196,15 +208,21 @@ impl PartialEq for LayerRTreeObject { } } -impl RTreeObject for LayerRTreeObject { +impl RTreeObject for LayerRTreeObject +where + L: ?Sized, +{ type Envelope = AABB<[IntKey; 2]>; fn envelope(&self) -> Self::Envelope { self.envelope } } -impl LayerRTreeObject { - fn new(layer: Arc) -> Self { +impl LayerRTreeObject +where + L: ?Sized + Layer, +{ + fn new(layer: Arc) -> Self { let key_range = layer.get_key_range(); let lsn_range = layer.get_lsn_range(); @@ -223,12 +241,15 @@ impl LayerRTreeObject { } /// Return value of LayerMap::search -pub struct SearchResult { - pub layer: Arc, +pub struct SearchResult { + pub layer: Arc, pub lsn_floor: Lsn, } -impl LayerMap { +impl LayerMap +where + L: ?Sized + Layer, +{ /// /// Find the latest layer that covers the given 'key', with lsn < /// 'end_lsn'. @@ -240,10 +261,10 @@ impl LayerMap { /// contain the version, even if it's missing from the returned /// layer. 
/// - pub fn search(&self, key: Key, end_lsn: Lsn) -> Result> { + pub fn search(&self, key: Key, end_lsn: Lsn) -> Result>> { // linear search // Find the latest image layer that covers the given key - let mut latest_img: Option> = None; + let mut latest_img: Option> = None; let mut latest_img_lsn: Option = None; let envelope = AABB::from_corners( [IntKey::from(key.to_i128()), IntKey::from(0i128)], @@ -277,7 +298,7 @@ impl LayerMap { } // Search the delta layers - let mut latest_delta: Option> = None; + let mut latest_delta: Option> = None; for e in self .historic_layers .locate_in_envelope_intersecting(&envelope) @@ -301,7 +322,7 @@ impl LayerMap { // No need to search any further trace!( "found layer {} for request on {key} at {end_lsn}", - l.filename().display(), + l.short_id(), ); latest_delta.replace(Arc::clone(l)); break; @@ -319,7 +340,7 @@ impl LayerMap { if let Some(l) = latest_delta { trace!( "found (old) layer {} for request on {key} at {end_lsn}", - l.filename().display(), + l.short_id(), ); let lsn_floor = std::cmp::max( Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), @@ -344,7 +365,7 @@ impl LayerMap { /// /// Insert an on-disk layer /// - pub fn insert_historic(&mut self, layer: Arc) { + pub fn insert_historic(&mut self, layer: Arc) { if layer.get_key_range() == (Key::MIN..Key::MAX) { self.l0_delta_layers.push(layer.clone()); } @@ -357,7 +378,7 @@ impl LayerMap { /// /// This should be called when the corresponding file on disk has been deleted. /// - pub fn remove_historic(&mut self, layer: Arc) { + pub fn remove_historic(&mut self, layer: Arc) { if layer.get_key_range() == (Key::MIN..Key::MAX) { let len_before = self.l0_delta_layers.len(); @@ -426,13 +447,13 @@ impl LayerMap { } } - pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { self.historic_layers.iter().map(|e| e.layer.clone()) } /// Find the last image layer that covers 'key', ignoring any image layers /// newer than 'lsn'. - fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { + fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { let mut candidate_lsn = Lsn(0); let mut candidate = None; let envelope = AABB::from_corners( @@ -474,7 +495,7 @@ impl LayerMap { &self, key_range: &Range, lsn: Lsn, - ) -> Result, Option>)>> { + ) -> Result, Option>)>> { let mut points = vec![key_range.start]; let envelope = AABB::from_corners( [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], @@ -559,7 +580,7 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { + pub fn get_level0_deltas(&self) -> Result>> { Ok(self.l0_delta_layers.clone()) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 8dafcab124..3ad62587d3 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -14,6 +14,7 @@ use utils::{ lsn::Lsn, }; +use super::filename::LayerFileName; pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -69,26 +70,9 @@ pub enum ValueReconstructResult { Missing, } -/// A Layer contains all data in a "rectangle" consisting of a range of keys and -/// range of LSNs. -/// -/// There are two kinds of layers, in-memory and on-disk layers. In-memory -/// layers are used to ingest incoming WAL, and provide fast access to the -/// recent page versions. On-disk layers are stored as files on disk, and are -/// immutable. This trait presents the common functionality of in-memory and -/// on-disk layers. 
-/// -/// Furthermore, there are two kinds of on-disk layers: delta and image layers. -/// A delta layer contains all modifications within a range of LSNs and keys. -/// An image layer is a snapshot of all the data in a key-range, at a single -/// LSN -/// +/// Supertrait of the [`Layer`] trait that captures the bare minimum interface +/// required by [`LayerMap`]. pub trait Layer: Send + Sync { - fn get_tenant_id(&self) -> TenantId; - - /// Identify the timeline this layer belongs to - fn get_timeline_id(&self) -> TimelineId; - /// Range of keys that this layer covers fn get_key_range(&self) -> Range; @@ -100,13 +84,11 @@ pub trait Layer: Send + Sync { /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 fn get_lsn_range(&self) -> Range; - /// Filename used to store this layer on disk. (Even in-memory layers - /// implement this, to print a handy unique identifier for the layer for - /// log messages, even though they're never not on disk.) - fn filename(&self) -> PathBuf; - - /// If a layer has a corresponding file on a local filesystem, return its absolute path. - fn local_path(&self) -> Option; + /// Does this layer only contain some data for the key-range (incremental), + /// or does it contain a version of every page? This is important to know + /// for garbage collecting old layers: an incremental layer depends on + /// the previous non-incremental layer. + fn is_incremental(&self) -> bool; /// /// Return data needed to reconstruct given page at LSN. @@ -127,14 +109,39 @@ pub trait Layer: Send + Sync { reconstruct_data: &mut ValueReconstructState, ) -> Result; - /// Does this layer only contain some data for the key-range (incremental), - /// or does it contain a version of every page? This is important to know - /// for garbage collecting old layers: an incremental layer depends on - /// the previous non-incremental layer. - fn is_incremental(&self) -> bool; + /// A short ID string that uniquely identifies the given layer within a [`LayerMap`]. + fn short_id(&self) -> String; - /// Returns true for layers that are represented in memory. - fn is_in_memory(&self) -> bool; + /// Dump summary of the contents of the layer to stdout + fn dump(&self, verbose: bool) -> Result<()>; +} + +/// A Layer contains all data in a "rectangle" consisting of a range of keys and +/// range of LSNs. +/// +/// There are two kinds of layers, in-memory and on-disk layers. In-memory +/// layers are used to ingest incoming WAL, and provide fast access to the +/// recent page versions. On-disk layers are stored as files on disk, and are +/// immutable. This trait presents the common functionality of in-memory and +/// on-disk layers. +/// +/// Furthermore, there are two kinds of on-disk layers: delta and image layers. +/// A delta layer contains all modifications within a range of LSNs and keys. +/// An image layer is a snapshot of all the data in a key-range, at a single +/// LSN +/// +pub trait PersistentLayer: Layer { + fn get_tenant_id(&self) -> TenantId; + + /// Identify the timeline this layer belongs to + fn get_timeline_id(&self) -> TimelineId; + + /// File name used for this layer, both in the pageserver's local filesystem + /// state as well as in the remote storage. + fn filename(&self) -> LayerFileName; + + // Path to the layer file in the local filesystem. 
+ fn local_path(&self) -> PathBuf; /// Iterate through all keys and values stored in the layer fn iter(&self) -> Box> + '_>; @@ -147,7 +154,4 @@ pub trait Layer: Send + Sync { /// Permanently remove this layer from disk. fn delete(&self) -> Result<()>; - - /// Dump summary of the contents of the layer to stdout - fn dump(&self, verbose: bool) -> Result<()>; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ed60530f83..9a4194d916 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -11,7 +11,7 @@ use tokio::task::spawn_blocking; use tracing::*; use std::cmp::{max, min, Ordering}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; @@ -30,7 +30,7 @@ use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::{save_metadata, TimelineMetadata}, par_fsync, - storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, + storage_layer::{PersistentLayer, ValueReconstructResult, ValueReconstructState}, }; use crate::config::PageServerConf; @@ -62,6 +62,9 @@ use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; use crate::{page_cache, storage_sync::index::LayerFileMetadata}; +use super::filename::LayerFileName; +use super::storage_layer::Layer; + #[derive(Debug, PartialEq, Eq, Clone, Copy)] enum FlushLoopState { NotStarted, @@ -78,7 +81,7 @@ pub struct Timeline { pub pg_version: u32, - pub layers: RwLock, + pub layers: RwLock>, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. @@ -927,7 +930,7 @@ impl Timeline { let layer = ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); - trace!("found layer {}", layer.filename().display()); + trace!("found layer {}", layer.path().display()); total_physical_size += layer.path().metadata()?.len(); layers.insert_historic(Arc::new(layer)); num_layers += 1; @@ -951,7 +954,7 @@ impl Timeline { let layer = DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); - trace!("found layer {}", layer.filename().display()); + trace!("found layer {}", layer.path().display()); total_physical_size += layer.path().metadata()?.len(); layers.insert_historic(Arc::new(layer)); num_layers += 1; @@ -998,9 +1001,9 @@ impl Timeline { &self, index_part: &IndexPart, remote_client: &RemoteTimelineClient, - local_layers: HashSet, + local_layers: HashMap>, up_to_date_disk_consistent_lsn: Lsn, - ) -> anyhow::Result> { + ) -> anyhow::Result>> { // Are we missing some files that are present in remote storage? // Download them now. // TODO Downloading many files this way is not efficient. 
@@ -1012,8 +1015,7 @@ impl Timeline { let mut local_only_layers = local_layers; let timeline_dir = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for remote_layer_name in &index_part.timeline_layers { - let local_layer_path = timeline_dir.join(remote_layer_name); - local_only_layers.remove(&local_layer_path); + local_only_layers.remove(remote_layer_name); let remote_layer_metadata = index_part .layer_metadata @@ -1021,10 +1023,7 @@ impl Timeline { .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); - let remote_layer_path = self - .conf - .remote_path(&local_layer_path) - .expect("local_layer_path received from the same conf that provided a workdir"); + let local_layer_path = timeline_dir.join(remote_layer_name.file_name()); if local_layer_path.exists() { let mut already_downloaded = true; @@ -1056,83 +1055,74 @@ impl Timeline { continue; } } else { - info!("remote layer {remote_layer_path:?} does not exist locally"); + info!("remote layer {remote_layer_name:?} does not exist locally"); } - let layer_name = local_layer_path - .file_name() - .and_then(|os_str| os_str.to_str()) - .with_context(|| { - format!("Layer file {local_layer_path:?} has no name in unicode") - })?; - if let Some(imgfilename) = ImageFileName::parse_str(layer_name) { - if imgfilename.lsn > up_to_date_disk_consistent_lsn { - warn!( + match remote_layer_name { + LayerFileName::Image(imgfilename) => { + if imgfilename.lsn > up_to_date_disk_consistent_lsn { + warn!( "found future image layer {} on timeline {} remote_consistent_lsn is {}", imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn ); - continue; + continue; + } + + trace!("downloading image file: {remote_layer_name:?}"); + let downloaded_size = remote_client + .download_layer_file(remote_layer_name, &remote_layer_metadata) + .await + .with_context(|| { + format!("failed to download image layer {remote_layer_name:?}") + })?; + trace!("done"); + + let image_layer = + ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, imgfilename); + + self.layers + .write() + .unwrap() + .insert_historic(Arc::new(image_layer)); + self.metrics + .current_physical_size_gauge + .add(downloaded_size); } - - trace!("downloading image file: {remote_layer_path:?}"); - let downloaded_size = remote_client - .download_layer_file(&remote_layer_path, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download image layer from path {remote_layer_path:?}") - })?; - trace!("done"); - - let image_layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(image_layer)); - self.metrics - .current_physical_size_gauge - .add(downloaded_size); - } else if let Some(deltafilename) = DeltaFileName::parse_str(layer_name) { - // Create a DeltaLayer struct for each delta file. - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { - warn!( + LayerFileName::Delta(deltafilename) => { + // Create a DeltaLayer struct for each delta file. + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. 
For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { + warn!( "found future delta layer {} on timeline {} remote_consistent_lsn is {}", deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn ); - continue; + continue; + } + + trace!("downloading delta file: {remote_layer_name:?}"); + let sz = remote_client + .download_layer_file(remote_layer_name, &remote_layer_metadata) + .await + .with_context(|| { + format!("failed to download delta layer {remote_layer_name:?}") + })?; + trace!("done"); + + let delta_layer = + DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, deltafilename); + + self.layers + .write() + .unwrap() + .insert_historic(Arc::new(delta_layer)); + self.metrics.current_physical_size_gauge.add(sz); } - - trace!("downloading delta file: {remote_layer_path:?}"); - let sz = remote_client - .download_layer_file(&remote_layer_path, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download delta layer from path {remote_layer_path:?}") - })?; - trace!("done"); - - let delta_layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(delta_layer)); - self.metrics.current_physical_size_gauge.add(sz); - } else if layer_name.ends_with(".old") { - // For details see https://github.com/neondatabase/neon/issues/3024 - warn!( - "got backup file on the remote storage, ignoring it {file}", - file = layer_name - ) - } else { - bail!("unexpected layer filename {layer_name} in remote storage path: {remote_layer_path:?}"); + #[cfg(test)] + LayerFileName::Test(_) => unreachable!(), } } @@ -1169,18 +1159,13 @@ impl Timeline { let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn(); - // Build a map of local layers for quick lookups let local_layers = self .layers .read() .unwrap() .iter_historic_layers() - .map(|historic_layer| { - historic_layer - .local_path() - .expect("Historic layers should have a path") - }) - .collect::>(); + .map(|l| (l.filename(), l)) + .collect::>(); let local_only_layers = match index_part { Some(index_part) => { @@ -1189,6 +1174,7 @@ impl Timeline { index_part.timeline_layers.len() ); remote_client.init_upload_queue(index_part)?; + self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn) .await? } @@ -1200,14 +1186,15 @@ impl Timeline { }; // Are there local files that don't exist remotely? Schedule uploads for them - for layer_path in &local_only_layers { + for (layer_name, layer) in &local_only_layers { + let layer_path = layer.local_path(); let layer_size = layer_path .metadata() .with_context(|| format!("failed to get file {layer_path:?} metadata"))? 
.len(); info!("scheduling {layer_path:?} for upload"); remote_client - .schedule_layer_file_upload(layer_path, &LayerFileMetadata::new(layer_size))?; + .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; } if !local_only_layers.is_empty() { remote_client.schedule_index_upload(up_to_date_metadata)?; @@ -1322,7 +1309,36 @@ impl Timeline { Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), } } +} +type TraversalId = String; + +trait TraversalLayerExt { + fn traversal_id(&self) -> TraversalId; +} + +impl TraversalLayerExt for Arc { + fn traversal_id(&self) -> String { + debug_assert!( + self.local_path().to_str().unwrap() + .contains(&format!("{}", self.get_timeline_id())), + "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary", + ); + format!("{}", self.local_path().display()) + } +} + +impl TraversalLayerExt for Arc { + fn traversal_id(&self) -> String { + format!( + "timeline {} in-memory {}", + self.get_timeline_id(), + self.short_id() + ) + } +} + +impl Timeline { /// /// Get a handle to a Layer for reading. /// @@ -1343,7 +1359,7 @@ impl Timeline { // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. - let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + let mut traversal_path = Vec::<(ValueReconstructResult, Lsn, TraversalId)>::new(); let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { *cached_lsn @@ -1425,7 +1441,7 @@ impl Timeline { reconstruct_state, )?; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, open_layer.clone())); + traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue; } } @@ -1440,7 +1456,7 @@ impl Timeline { reconstruct_state, )?; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, frozen_layer.clone())); + traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } } @@ -1455,7 +1471,7 @@ impl Timeline { reconstruct_state, )?; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, layer)); + traversal_path.push((result, cont_lsn, layer.traversal_id())); } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent result = ValueReconstructResult::Continue; @@ -1670,7 +1686,7 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))] + #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer @@ -1729,7 +1745,7 @@ impl Timeline { fn update_metadata_file( &self, disk_consistent_lsn: Lsn, - layer_paths_to_upload: HashMap, + layer_paths_to_upload: HashMap, ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. 
We only track @@ -1794,10 +1810,11 @@ impl Timeline { fn create_delta_layer( &self, frozen_layer: &InMemoryLayer, - ) -> anyhow::Result<(PathBuf, LayerFileMetadata)> { + ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> { // Write it out let new_delta = frozen_layer.write_to_disk()?; let new_delta_path = new_delta.path(); + let new_delta_filename = new_delta.filename(); // Sync it to disk. // @@ -1826,7 +1843,7 @@ impl Timeline { self.metrics.num_persistent_files_created.inc_by(1); self.metrics.persistent_bytes_written.inc_by(sz); - Ok((new_delta_path, LayerFileMetadata::new(sz))) + Ok((new_delta_filename, LayerFileMetadata::new(sz))) } fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> { @@ -1888,7 +1905,7 @@ impl Timeline { partitioning: &KeyPartitioning, lsn: Lsn, force: bool, - ) -> anyhow::Result> { + ) -> anyhow::Result> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); for partition in partitioning.parts.iter() { @@ -1966,9 +1983,10 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().unwrap(); + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for l in image_layers { - let path = l.path(); - let metadata = path.metadata()?; + let path = l.filename(); + let metadata = timeline_path.join(path.file_name()).metadata()?; layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); @@ -1984,7 +2002,7 @@ impl Timeline { #[derive(Default)] struct CompactLevel0Phase1Result { new_layers: Vec, - deltas_to_compact: Vec>, + deltas_to_compact: Vec>, } impl Timeline { @@ -2042,7 +2060,7 @@ impl Timeline { level0_deltas.len() ); for l in deltas_to_compact.iter() { - info!("compact includes {}", l.filename().display()); + info!("compact includes {}", l.filename().file_name()); } // We don't need the original list of layers anymore. Drop it so that // we don't accidentally use it later in the function. 
@@ -2271,7 +2289,7 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_upload( - &new_delta_path, + &l.filename(), &LayerFileMetadata::new(metadata.len()), )?; } @@ -2280,19 +2298,19 @@ impl Timeline { self.metrics.current_physical_size_gauge.add(metadata.len()); new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); - layers.insert_historic(Arc::new(l)); + let x: Arc = Arc::new(l); + layers.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones - let mut layer_paths_to_delete = Vec::with_capacity(deltas_to_compact.len()); + let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { - if let Some(path) = l.local_path() { - self.metrics - .current_physical_size_gauge - .sub(path.metadata()?.len()); - layer_paths_to_delete.push(path); - } + let path = l.local_path(); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); + layer_names_to_delete.push(l.filename()); l.delete()?; layers.remove_historic(l); } @@ -2300,7 +2318,7 @@ impl Timeline { // Also schedule the deletions in remote storage if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_paths_to_delete)?; + remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } Ok(()) @@ -2485,23 +2503,13 @@ impl Timeline { // let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { - // This layer is in the process of being flushed to disk. - // It will be swapped out of the layer map, replaced with - // on-disk layers containing the same data. - // We can't GC it, as it's not on disk. We can't remove it - // from the layer map yet, as it would make its data - // inaccessible. - if l.is_in_memory() { - continue; - } - result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename().display(), + l.filename().file_name(), horizon_cutoff ); result.layers_needed_by_cutoff += 1; @@ -2512,7 +2520,7 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename().display(), + l.filename().file_name(), pitr_cutoff ); result.layers_needed_by_pitr += 1; @@ -2529,7 +2537,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename().display(), + l.filename().file_name(), retain_lsn, l.is_incremental(), ); @@ -2562,7 +2570,7 @@ impl Timeline { { debug!( "keeping {} because it is the latest layer", - l.filename().display() + l.filename().file_name() ); result.layers_not_updated += 1; continue 'outer; @@ -2571,7 +2579,7 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename().display(), + l.filename().file_name(), l.is_incremental(), ); layers_to_remove.push(Arc::clone(&l)); @@ -2580,14 +2588,13 @@ impl Timeline { // Actually delete the layers from disk and remove them from the map. // (couldn't do this in the loop above, because you cannot modify a collection // while iterating it. 
BTreeMap::retain() would be another option) - let mut layer_paths_to_delete = Vec::with_capacity(layers_to_remove.len()); + let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { - if let Some(path) = doomed_layer.local_path() { - self.metrics - .current_physical_size_gauge - .sub(path.metadata()?.len()); - layer_paths_to_delete.push(path); - } + let path = doomed_layer.local_path(); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); + layer_names_to_delete.push(doomed_layer.filename()); doomed_layer.delete()?; layers.remove_historic(doomed_layer); result.layers_removed += 1; @@ -2603,7 +2610,7 @@ impl Timeline { } if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_paths_to_delete)?; + remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } result.elapsed = now.elapsed()?; @@ -2688,7 +2695,7 @@ impl Timeline { /// to an error, as anyhow context information. fn layer_traversal_error( msg: String, - path: Vec<(ValueReconstructResult, Lsn, Arc)>, + path: Vec<(ValueReconstructResult, Lsn, TraversalId)>, ) -> anyhow::Result<()> { // We want the original 'msg' to be the outermost context. The outermost context // is the most high-level information, which also gets propagated to the client. @@ -2697,9 +2704,7 @@ fn layer_traversal_error( .map(|(r, c, l)| { format!( "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l.filename().display() + r, c, l, ) }) .chain(std::iter::once(msg)); diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 1228f8b86e..afc413f3e3 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -411,12 +411,10 @@ def test_tenant_ignores_backup_file( env.postgres.stop_all() env.pageserver.stop() - # file is still mentioned in the index. Removing it requires more hacking on remote queue initialization - # Will be easier to do once there will be no .download_missing so it will be only one cycle through the layers - # in load_layer_map + # the .old file is gone from newly serialized index_part new_index_part = local_fs_index_part(env, tenant_id, timeline_id) backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"]) - assert len(list(backup_layers)) == 1 + assert len(list(backup_layers)) == 0 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) From e2ae4c09a6f0eda78896a8c50fcc3db3f95adf44 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 13 Dec 2022 10:19:33 +0400 Subject: [PATCH 056/167] Put e2e tag back. 32662ff1c42a1f required running e2e tests on patched branch of cloud repo; not that it is merged, put the tag back. 
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 79a95858b7..476de4953c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -506,7 +506,7 @@ jobs: --user "${{ secrets.CI_ACCESS_TOKEN }}" \ --data \ "{ - \"ref\": \"replace-etcd\", + \"ref\": \"main\", \"inputs\": { \"ci_job_name\": \"neon-cloud-e2e\", \"commit_hash\": \"$COMMIT_SHA\", From 544777e86b8b47e6040c936f44b49a1754dfe266 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 13 Dec 2022 10:36:25 +0400 Subject: [PATCH 057/167] Fix storage_broker deploy typo. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 476de4953c..0ca4db882e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -740,7 +740,7 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}' - NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY", storage_broker_config: }' + NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}' echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' From d6325aa79d07c2c362ce3f61128a3b693caff12a Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 13 Dec 2022 12:11:42 +0400 Subject: [PATCH 058/167] Disable body size limit in ingress broker deploy. We have infinite streams. 
--- .github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml | 2 ++ .github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml | 2 ++ .../prod-ap-southeast-1-epsilon.neon-storage-broker.yaml | 2 ++ .../prod-eu-central-1-gamma.neon-storage-broker.yaml | 2 ++ .../helm-values/prod-us-east-2-delta.neon-storage-broker.yaml | 2 ++ .github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index d13cebead1..f139244cff 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -10,6 +10,8 @@ ingress: nginx.ingress.kubernetes.io/backend-protocol: "GRPC" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # we have basically infinite streams, disable body size limit + nginx.ingress.kubernetes.io/proxy-body-size: "0" cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index b8b8fb055c..d59d2ebe70 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -10,6 +10,8 @@ ingress: nginx.ingress.kubernetes.io/backend-protocol: "GRPC" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # we have basically infinite streams, disable body size limit + nginx.ingress.kubernetes.io/proxy-body-size: "0" cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml index bd979e0649..9654097934 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml @@ -10,6 +10,8 @@ ingress: nginx.ingress.kubernetes.io/backend-protocol: "GRPC" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # we have basically infinite streams, disable body size limit + nginx.ingress.kubernetes.io/proxy-body-size: "0" cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml index 79cc751c65..9582327df3 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -10,6 +10,8 @@ ingress: nginx.ingress.kubernetes.io/backend-protocol: "GRPC" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # we have basically infinite streams, disable body size limit + nginx.ingress.kubernetes.io/proxy-body-size: "0" cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml index 959abea20c..7c64d4c7bd 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml +++ 
b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -10,6 +10,8 @@ ingress: nginx.ingress.kubernetes.io/backend-protocol: "GRPC" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # we have basically infinite streams, disable body size limit + nginx.ingress.kubernetes.io/proxy-body-size: "0" cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml index f41f87b7b7..1014d36264 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml @@ -10,6 +10,8 @@ ingress: nginx.ingress.kubernetes.io/backend-protocol: "GRPC" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # we have basically infinite streams, disable body size limit + nginx.ingress.kubernetes.io/proxy-body-size: "0" cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" hosts: From 7a16cde737c2caa4e5716a315bf2575cfbd41d17 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Dec 2022 14:06:20 +0200 Subject: [PATCH 059/167] Remove useless pub trait method (#3076) --- libs/remote_storage/src/lib.rs | 5 ----- libs/remote_storage/src/local_fs.rs | 4 ---- 2 files changed, 9 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 04335d8f2f..3bbffd6941 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -104,11 +104,6 @@ pub trait RemoteStorage: Send + Sync + 'static { ) -> Result; async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; - - /// Downcast to LocalFs implementation. For tests. 
- fn as_local(&self) -> Option<&LocalFs> { - None - } } pub struct Download { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 3e2bded203..50a84eb33f 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -283,10 +283,6 @@ impl RemoteStorage for LocalFs { bail!("File {file_path:?} either does not exist or is not a file") } } - - fn as_local(&self) -> Option<&LocalFs> { - Some(self) - } } fn storage_metadata_path(original_path: &Path) -> PathBuf { From e5d523c86a2a787b3c6a84a6baffd6e8285b81ef Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 13 Dec 2022 14:11:40 +0100 Subject: [PATCH 060/167] Add new us-west-2 region (#3071) --- .github/ansible/prod.us-west-2.hosts.yaml | 36 +++++++++++++++++++++++ .github/workflows/build_and_test.yml | 2 +- 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 .github/ansible/prod.us-west-2.hosts.yaml diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml new file mode 100644 index 0000000000..d5ef761cd5 --- /dev/null +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -0,0 +1,36 @@ +storage: + vars: + bucket_name: neon-prod-storage-us-west-2 + bucket_region: us-west-2 + console_mgmt_base_url: http://console-release.local + etcd_endpoints: etcd-0.us-west-2.aws.neon.tech:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: us-west-2 + ansible_aws_ssm_bucket_name: neon-prod-storage-us-west-2 + console_region_id: aws-us-west-2-new + + children: + pageservers: + hosts: + pageserver-0.us-west-2.aws.neon.tech: + ansible_host: i-0d9f6dfae0e1c780d + pageserver-1.us-west-2.aws.neon.tech: + ansible_host: i-0c834be1dddba8b3f + + safekeepers: + hosts: + safekeeper-0.us-west-2.aws.neon.tech: + ansible_host: i-00719d8a74986fda6 + safekeeper-1.us-west-2.aws.neon.tech: + ansible_host: i-074682f9d3c712e7c + safekeeper-2.us-west-2.aws.neon.tech: + ansible_host: i-042b7efb1729d7966 + diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0ca4db882e..8707065ef2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -891,7 +891,7 @@ jobs: shell: bash strategy: matrix: - target_region: [ us-east-2, eu-central-1, ap-southeast-1 ] + target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] steps: - name: Checkout uses: actions/checkout@v3 From 607c0facfc26734e6a13def384c8a1167ea378e0 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Thu, 3 Nov 2022 18:07:16 +0300 Subject: [PATCH 061/167] [proxy] Propagate more console API errors to the user This patch aims to fix some of the inconsistencies in error reporting, for example "Internal error" or "Console request failed" instead of "password authentication failed for user ''". 
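
As a rough, self-contained sketch of the idea (simplified for illustration: bare u16 status
codes and an inherent method stand in for the reqwest::StatusCode values and the
UserFacingError trait used in the diff below), only a vetted set of console statuses is
translated into user-facing text:

    const REQUEST_FAILED: &str = "Console request failed";

    #[derive(Debug)]
    enum ApiError {
        // Error reported by the console itself, carrying its HTTP status.
        Console { status: u16, text: String },
        // Transport-level failure (broken pipe, malformed payload, ...).
        Transport(std::io::Error),
    }

    impl ApiError {
        // Only a vetted subset of console statuses becomes user-facing text;
        // everything else collapses into the generic message so nothing leaks.
        fn to_string_client(&self) -> String {
            match self {
                ApiError::Console { status, .. } => match *status {
                    404 => format!("{REQUEST_FAILED}: endpoint cannot be found"),
                    406 => format!("{REQUEST_FAILED}: endpoint is disabled"),
                    423 => format!("{REQUEST_FAILED}: endpoint is temporarily unavailable"),
                    _ => REQUEST_FAILED.to_owned(),
                },
                ApiError::Transport(_) => REQUEST_FAILED.to_owned(),
            }
        }
    }

    fn main() {
        let locked = ApiError::Console { status: 423, text: "project locked".into() };
        println!("{locked:?} -> {}", locked.to_string_client());

        let broken = ApiError::Transport(std::io::Error::new(
            std::io::ErrorKind::BrokenPipe,
            "broken pipe",
        ));
        println!("{broken:?} -> {}", broken.to_string_client());
    }

Anything not explicitly vetted falls back to the generic message, so no secret or internal
detail can reach the client; the actual wiring is in the console.rs changes that follow.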
--- proxy/src/auth.rs | 10 +- proxy/src/auth/backend/console.rs | 308 ++++++++++++++++++-------- proxy/src/auth/backend/postgres.rs | 108 +++++---- proxy/src/auth/flow.rs | 6 +- proxy/src/error.rs | 20 +- proxy/src/http.rs | 10 - proxy/src/proxy.rs | 17 +- proxy/src/proxy/tests.rs | 13 +- proxy/src/sasl.rs | 20 +- proxy/src/sasl/stream.rs | 31 ++- proxy/src/scram/exchange.rs | 23 +- proxy/src/scram/secret.rs | 11 +- proxy/src/stream.rs | 4 +- test_runner/fixtures/neon_fixtures.py | 99 +++++---- test_runner/regress/test_proxy.py | 125 ++++++----- 15 files changed, 504 insertions(+), 301 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index f272f9adc1..5355946beb 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -49,6 +49,9 @@ pub enum AuthErrorImpl { )] MissingProjectName, + #[error("password authentication failed for user '{0}'")] + AuthFailed(Box), + /// Errors produced by e.g. [`crate::stream::PqStream`]. #[error(transparent)] Io(#[from] io::Error), @@ -62,6 +65,10 @@ impl AuthError { pub fn bad_auth_method(name: impl Into>) -> Self { AuthErrorImpl::BadAuthMethod(name.into()).into() } + + pub fn auth_failed(user: impl Into>) -> Self { + AuthErrorImpl::AuthFailed(user.into()).into() + } } impl> From for AuthError { @@ -78,10 +85,11 @@ impl UserFacingError for AuthError { GetAuthInfo(e) => e.to_string_client(), WakeCompute(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), + AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), MalformedPassword(_) => self.to_string(), MissingProjectName => self.to_string(), - _ => "Internal error".to_string(), + Io(_) => "Internal error".to_string(), } } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 929dfb33f7..040870fc8e 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -5,26 +5,74 @@ use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, error::{io_error, UserFacingError}, - http, scram, + http, sasl, scram, stream::PqStream, }; use futures::TryFutureExt; -use serde::{Deserialize, Serialize}; +use reqwest::StatusCode as HttpStatusCode; +use serde::Deserialize; use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, info_span}; +use tracing::{error, info, info_span, warn, Instrument}; +/// A go-to error message which doesn't leak any detail. const REQUEST_FAILED: &str = "Console request failed"; +/// Common console API error. #[derive(Debug, Error)] -#[error("{}", REQUEST_FAILED)] -pub struct TransportError(#[from] std::io::Error); +pub enum ApiError { + /// Error returned by the console itself. + #[error("{REQUEST_FAILED} with {}: {}", .status, .text)] + Console { + status: HttpStatusCode, + text: Box, + }, -impl UserFacingError for TransportError {} + /// Various IO errors like broken pipe or malformed payload. + #[error("{REQUEST_FAILED}: {0}")] + Transport(#[from] std::io::Error), +} + +impl ApiError { + /// Returns HTTP status code if it's the reason for failure. + fn http_status_code(&self) -> Option { + use ApiError::*; + match self { + Console { status, .. } => Some(*status), + _ => None, + } + } +} + +impl UserFacingError for ApiError { + fn to_string_client(&self) -> String { + use ApiError::*; + match self { + // To minimize risks, only select errors are forwarded to users. + // Ask @neondatabase/control-plane for review before adding more. + Console { status, .. 
} => match *status { + HttpStatusCode::NOT_FOUND => { + // Status 404: failed to get a project-related resource. + format!("{REQUEST_FAILED}: endpoint cannot be found") + } + HttpStatusCode::NOT_ACCEPTABLE => { + // Status 406: endpoint is disabled (we don't allow connections). + format!("{REQUEST_FAILED}: endpoint is disabled") + } + HttpStatusCode::LOCKED => { + // Status 423: project might be in maintenance mode (or bad state). + format!("{REQUEST_FAILED}: endpoint is temporary unavailable") + } + _ => REQUEST_FAILED.to_owned(), + }, + _ => REQUEST_FAILED.to_owned(), + } + } +} // Helps eliminate graceless `.map_err` calls without introducing another ctor. -impl From for TransportError { +impl From for ApiError { fn from(e: reqwest::Error) -> Self { io_error(e).into() } @@ -37,61 +85,73 @@ pub enum GetAuthInfoError { BadSecret, #[error(transparent)] - Transport(TransportError), + ApiError(ApiError), +} + +// This allows more useful interactions than `#[from]`. +impl> From for GetAuthInfoError { + fn from(e: E) -> Self { + Self::ApiError(e.into()) + } } impl UserFacingError for GetAuthInfoError { fn to_string_client(&self) -> String { use GetAuthInfoError::*; match self { + // We absolutely should not leak any secrets! BadSecret => REQUEST_FAILED.to_owned(), - Transport(e) => e.to_string_client(), + // However, API might return a meaningful error. + ApiError(e) => e.to_string_client(), } } } -impl> From for GetAuthInfoError { - fn from(e: E) -> Self { - Self::Transport(e.into()) - } -} - #[derive(Debug, Error)] pub enum WakeComputeError { - // We shouldn't show users the address even if it's broken. #[error("Console responded with a malformed compute address: {0}")] - BadComputeAddress(String), + BadComputeAddress(Box), #[error(transparent)] - Transport(TransportError), + ApiError(ApiError), +} + +// This allows more useful interactions than `#[from]`. +impl> From for WakeComputeError { + fn from(e: E) -> Self { + Self::ApiError(e.into()) + } } impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; match self { + // We shouldn't show user the address even if it's broken. + // Besides, user is unlikely to care about this detail. BadComputeAddress(_) => REQUEST_FAILED.to_owned(), - Transport(e) => e.to_string_client(), + // However, API might return a meaningful error. + ApiError(e) => e.to_string_client(), } } } -impl> From for WakeComputeError { - fn from(e: E) -> Self { - Self::Transport(e.into()) - } +/// Console's response which holds client's auth secret. +#[derive(Deserialize, Debug)] +struct GetRoleSecret { + role_secret: Box, } -// TODO: convert into an enum with "error" -#[derive(Serialize, Deserialize, Debug)] -struct GetRoleSecretResponse { - role_secret: String, +/// Console's response which holds compute node's `host:port` pair. +#[derive(Deserialize, Debug)] +struct WakeCompute { + address: Box, } -// TODO: convert into an enum with "error" -#[derive(Serialize, Deserialize, Debug)] -struct GetWakeComputeResponse { - address: String, +/// Console's error response with human-readable description. +#[derive(Deserialize, Debug)] +struct ConsoleError { + error: Box, } /// Auth secret which is managed by the cloud. @@ -110,6 +170,12 @@ pub(super) struct Api<'a> { creds: &'a ClientCredentials<'a>, } +impl<'a> AsRef> for Api<'a> { + fn as_ref(&self) -> &ClientCredentials<'a> { + self.creds + } +} + impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. 
pub(super) fn new( @@ -126,83 +192,88 @@ impl<'a> Api<'a> { /// Authenticate the existing user or throw an error. pub(super) async fn handle_user( - self, + &'a self, client: &mut PqStream, ) -> auth::Result> { - handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + handle_user(client, self, Self::get_auth_info, Self::wake_compute).await } +} - async fn get_auth_info(&self) -> Result { +impl Api<'_> { + async fn get_auth_info(&self) -> Result, GetAuthInfoError> { let request_id = uuid::Uuid::new_v4().to_string(); - let req = self - .endpoint - .get("proxy_get_role_secret") - .header("X-Request-ID", &request_id) - .query(&[("session_id", self.extra.session_id)]) - .query(&[ - ("application_name", self.extra.application_name), - ("project", Some(self.creds.project().expect("impossible"))), - ("role", Some(self.creds.user)), - ]) - .build()?; + async { + let request = self + .endpoint + .get("proxy_get_role_secret") + .header("X-Request-ID", &request_id) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ("role", Some(self.creds.user)), + ]) + .build()?; - let span = info_span!("http", id = request_id, url = req.url().as_str()); - info!(parent: &span, "request auth info"); - let msg = self - .endpoint - .checked_execute(req) - .and_then(|r| r.json::()) - .await - .map_err(|e| { - error!(parent: &span, "{e}"); - e - })?; + info!(url = request.url().as_str(), "sending http request"); + let response = self.endpoint.execute(request).await?; + let body = match parse_body::(response).await { + Ok(body) => body, + // Error 404 is special: it's ok not to have a secret. + Err(e) => match e.http_status_code() { + Some(HttpStatusCode::NOT_FOUND) => return Ok(None), + _otherwise => return Err(e.into()), + }, + }; - scram::ServerSecret::parse(&msg.role_secret) - .map(AuthInfo::Scram) - .ok_or(GetAuthInfoError::BadSecret) + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthInfo::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + + Ok(Some(secret)) + } + .map_err(crate::error::log_error) + .instrument(info_span!("get_auth_info", id = request_id)) + .await } /// Wake up the compute node and return the corresponding connection info. 
- pub(super) async fn wake_compute(&self) -> Result { + pub async fn wake_compute(&self) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); - let req = self - .endpoint - .get("proxy_wake_compute") - .header("X-Request-ID", &request_id) - .query(&[("session_id", self.extra.session_id)]) - .query(&[ - ("application_name", self.extra.application_name), - ("project", Some(self.creds.project().expect("impossible"))), - ]) - .build()?; + async { + let request = self + .endpoint + .get("proxy_wake_compute") + .header("X-Request-ID", &request_id) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ]) + .build()?; - let span = info_span!("http", id = request_id, url = req.url().as_str()); - info!(parent: &span, "request wake-up"); - let msg = self - .endpoint - .checked_execute(req) - .and_then(|r| r.json::()) - .await - .map_err(|e| { - error!(parent: &span, "{e}"); - e - })?; + info!(url = request.url().as_str(), "sending http request"); + let response = self.endpoint.execute(request).await?; + let body = parse_body::(response).await?; - // Unfortunately, ownership won't let us use `Option::ok_or` here. - let (host, port) = match parse_host_port(&msg.address) { - None => return Err(WakeComputeError::BadComputeAddress(msg.address)), - Some(x) => x, - }; + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&body.address) { + None => return Err(WakeComputeError::BadComputeAddress(body.address)), + Some(x) => x, + }; - let mut config = compute::ConnCfg::new(); - config - .host(host) - .port(port) - .dbname(self.creds.dbname) - .user(self.creds.user); + let mut config = compute::ConnCfg::new(); + config + .host(host) + .port(port) + .dbname(self.creds.dbname) + .user(self.creds.user); - Ok(config) + Ok(config) + } + .map_err(crate::error::log_error) + .instrument(info_span!("wake_compute", id = request_id)) + .await } } @@ -215,24 +286,40 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, ) -> auth::Result> where - GetAuthInfo: Future>, + Endpoint: AsRef>, + GetAuthInfo: Future, GetAuthInfoError>>, WakeCompute: Future>, { + let creds = endpoint.as_ref(); + info!("fetching user's authentication info"); - let auth_info = get_auth_info(endpoint).await?; + let info = get_auth_info(endpoint).await?.unwrap_or_else(|| { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. + info!("authentication info not found, mocking it"); + AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random())) + }); let flow = AuthFlow::new(client); - let scram_keys = match auth_info { + let scram_keys = match info { AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 info!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); } AuthInfo::Scram(secret) => { info!("auth endpoint chooses SCRAM"); let scram = auth::Scram(&secret); + let client_key = match flow.begin(scram).await?.authenticate().await? 
{ + sasl::Outcome::Success(key) => key, + sasl::Outcome::Failure(reason) => { + info!("auth backend failed with an error: {reason}"); + return Err(auth::AuthError::auth_failed(creds.user)); + } + }; + Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + client_key: client_key.as_bytes(), server_key: secret.server_key.as_bytes(), }) } @@ -249,6 +336,31 @@ where }) } +/// Parse http response body, taking status code into account. +async fn parse_body Deserialize<'a>>( + response: reqwest::Response, +) -> Result { + let status = response.status(); + if status.is_success() { + // We shouldn't log raw body because it may contain secrets. + info!("request succeeded, processing the body"); + return Ok(response.json().await?); + } + + // Don't throw an error here because it's not as important + // as the fact that the request itself has failed. + let body = response.json().await.unwrap_or_else(|e| { + warn!("failed to parse error body: {e}"); + ConsoleError { + error: "reason unclear (malformed error message)".into(), + } + }); + + let text = body.error; + error!("console responded with an error ({status}): {text}"); + Err(ApiError::Console { status, text }) +} + fn parse_host_port(input: &str) -> Option<(&str, u16)> { let (host, port) = input.split_once(':')?; Some((host, port.parse().ok()?)) diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index e56b62622a..8f16dc9fa8 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -1,7 +1,7 @@ //! Local mock of Cloud API V2. use super::{ - console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError}, + console::{self, AuthInfo, GetAuthInfoError, WakeComputeError}, AuthSuccess, }; use crate::{ @@ -12,7 +12,28 @@ use crate::{ stream::PqStream, url::ApiUrl, }; +use futures::TryFutureExt; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, info_span, warn, Instrument}; + +#[derive(Debug, Error)] +enum MockApiError { + #[error("Failed to read password: {0}")] + PasswordNotSet(tokio_postgres::Error), +} + +impl From for console::ApiError { + fn from(e: MockApiError) -> Self { + io_error(e).into() + } +} + +impl From for console::ApiError { + fn from(e: tokio_postgres::Error) -> Self { + io_error(e).into() + } +} #[must_use] pub(super) struct Api<'a> { @@ -20,10 +41,9 @@ pub(super) struct Api<'a> { creds: &'a ClientCredentials<'a>, } -// Helps eliminate graceless `.map_err` calls without introducing another ctor. -impl From for TransportError { - fn from(e: tokio_postgres::Error) -> Self { - io_error(e).into() +impl<'a> AsRef> for Api<'a> { + fn as_ref(&self) -> &ClientCredentials<'a> { + self.creds } } @@ -35,54 +55,55 @@ impl<'a> Api<'a> { /// Authenticate the existing user or throw an error. pub(super) async fn handle_user( - self, + &'a self, client: &mut PqStream, ) -> auth::Result> { // We reuse user handling logic from a production module. - console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + console::handle_user(client, self, Self::get_auth_info, Self::wake_compute).await } +} +impl Api<'_> { /// This implementation fetches the auth info from a local postgres instance. - async fn get_auth_info(&self) -> Result { - // Perhaps we could persist this connection, but then we'd have to - // write more code for reopening it if it got closed, which doesn't - // seem worth it. 
- let (client, connection) = - tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; + async fn get_auth_info(&self) -> Result, GetAuthInfoError> { + async { + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; - tokio::spawn(connection); - let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; - let rows = client.query(query, &[&self.creds.user]).await?; + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client.query(query, &[&self.creds.user]).await?; - match &rows[..] { - // We can't get a secret if there's no such user. - [] => Err(io_error(format!("unknown user '{}'", self.creds.user)).into()), + // We can get at most one row, because `rolname` is unique. + let row = match rows.get(0) { + Some(row) => row, + // This means that the user doesn't exist, so there can be no secret. + // However, this is still a *valid* outcome which is very similar + // to getting `404 Not found` from the Neon console. + None => { + warn!("user '{}' does not exist", self.creds.user); + return Ok(None); + } + }; - // We shouldn't get more than one row anyway. - [row, ..] => { - let entry = row - .try_get("rolpassword") - .map_err(|e| io_error(format!("failed to read user's password: {e}")))?; + let entry = row + .try_get("rolpassword") + .map_err(MockApiError::PasswordNotSet)?; - scram::ServerSecret::parse(entry) - .map(AuthInfo::Scram) - .or_else(|| { - // It could be an md5 hash if it's not a SCRAM secret. - let text = entry.strip_prefix("md5")?; - Some(AuthInfo::Md5({ - let mut bytes = [0u8; 16]; - hex::decode_to_slice(text, &mut bytes).ok()?; - bytes - })) - }) - // Putting the secret into this message is a security hazard! - .ok_or(GetAuthInfoError::BadSecret) - } + info!("got a secret: {entry}"); // safe since it's not a prod scenario + let secret = scram::ServerSecret::parse(entry).map(AuthInfo::Scram); + Ok(secret.or_else(|| parse_md5(entry).map(AuthInfo::Md5))) } + .map_err(crate::error::log_error) + .instrument(info_span!("get_auth_info", mock = self.endpoint.as_str())) + .await } /// We don't need to wake anything locally, so we just return the connection info. - pub(super) async fn wake_compute(&self) -> Result { + pub async fn wake_compute(&self) -> Result { let mut config = compute::ConnCfg::new(); config .host(self.endpoint.host_str().unwrap_or("localhost")) @@ -93,3 +114,12 @@ impl<'a> Api<'a> { Ok(config) } } + +fn parse_md5(input: &str) -> Option<[u8; 16]> { + let text = input.strip_prefix("md5")?; + + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + + Some(bytes) +} diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 865af4d2e5..d9ee50894d 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -89,7 +89,7 @@ impl AuthFlow<'_, S, PasswordHack> { /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> super::Result { + pub async fn authenticate(self) -> super::Result> { // Initial client message contains the chosen auth method's name. 
let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) @@ -101,10 +101,10 @@ impl AuthFlow<'_, S, Scram<'_>> { } let secret = self.state.0; - let key = sasl::SaslStream::new(self.stream, sasl.message) + let outcome = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new(secret, rand::random, None)) .await?; - Ok(key) + Ok(outcome) } } diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 0e376a37cd..f1cb44b1a8 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,4 +1,15 @@ -use std::io; +use std::{error::Error as StdError, fmt, io}; + +/// Upcast (almost) any error into an opaque [`io::Error`]. +pub fn io_error(e: impl Into>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} + +/// A small combinator for pluggable error logging. +pub fn log_error(e: E) -> E { + tracing::error!("{e}"); + e +} /// Marks errors that may be safely shown to a client. /// This trait can be seen as a specialized version of [`ToString`]. @@ -6,7 +17,7 @@ use std::io; /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it /// is way too convenient and tends to proliferate all across the codebase, /// ultimately leading to accidental leaks of sensitive data. -pub trait UserFacingError: ToString { +pub trait UserFacingError: fmt::Display { /// Format the error for client, stripping all sensitive info. /// /// Although this might be a no-op for many types, it's highly @@ -17,8 +28,3 @@ pub trait UserFacingError: ToString { self.to_string() } } - -/// Upcast (almost) any error into an opaque [`io::Error`]. -pub fn io_error(e: impl Into>) -> io::Error { - io::Error::new(io::ErrorKind::Other, e) -} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 6f9145678b..096a33d73d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -37,16 +37,6 @@ impl Endpoint { ) -> Result { self.client.execute(request).await } - - /// Execute a [request](reqwest::Request) and raise an error if status != 200. - pub async fn checked_execute( - &self, - request: reqwest::Request, - ) -> Result { - self.execute(request) - .await - .and_then(|r| r.error_for_status()) - } } #[cfg(test)] diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 411893fee5..da3cb144e3 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -49,17 +49,6 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); -/// A small combinator for pluggable error logging. -async fn log_error(future: F) -> F::Output -where - F: std::future::Future>, -{ - future.await.map_err(|err| { - error!("{err}"); - err - }) -} - pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, @@ -80,7 +69,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); tokio::spawn( - log_error(async move { + async move { info!("spawned a task for {peer_addr}"); socket @@ -88,6 +77,10 @@ pub async fn task_main( .context("failed to set socket option")?; handle_client(config, &cancel_map, session_id, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. + error!("per-client task finished with an error: {e:#}"); }) .instrument(info_span!("client", session = format_args!("{session_id}"))), ); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3d74dbae5a..24fbc57b99 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,6 +1,6 @@ ///! 
A group of high-level tests for connection establishing logic and auth. use super::*; -use crate::{auth, scram}; +use crate::{auth, sasl, scram}; use async_trait::async_trait; use rstest::rstest; use tokio_postgres::config::SslMode; @@ -100,8 +100,7 @@ impl Scram { } fn mock(user: &str) -> Self { - let salt = rand::random::<[u8; 32]>(); - Scram(scram::ServerSecret::mock(user, &salt)) + Scram(scram::ServerSecret::mock(user, rand::random())) } } @@ -111,13 +110,17 @@ impl TestAuth for Scram { self, stream: &mut PqStream>, ) -> anyhow::Result<()> { - auth::AuthFlow::new(stream) + let outcome = auth::AuthFlow::new(stream) .begin(auth::Scram(&self.0)) .await? .authenticate() .await?; - Ok(()) + use sasl::Outcome::*; + match outcome { + Success(_) => Ok(()), + Failure(reason) => bail!("autentication failed with an error: {reason}"), + } } } diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 689fca6049..6d1dd9fba5 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -16,22 +16,19 @@ use thiserror::Error; pub use channel_binding::ChannelBinding; pub use messages::FirstMessage; -pub use stream::SaslStream; +pub use stream::{Outcome, SaslStream}; /// Fine-grained auth errors help in writing tests. #[derive(Error, Debug)] pub enum Error { - #[error("Failed to authenticate client: {0}")] - AuthenticationFailed(&'static str), - #[error("Channel binding failed: {0}")] ChannelBindingFailed(&'static str), #[error("Unsupported channel binding method: {0}")] ChannelBindingBadMethod(Box), - #[error("Bad client message")] - BadClientMessage, + #[error("Bad client message: {0}")] + BadClientMessage(&'static str), #[error(transparent)] Io(#[from] io::Error), @@ -41,8 +38,6 @@ impl UserFacingError for Error { fn to_string_client(&self) -> String { use Error::*; match self { - // This constructor contains the reason why auth has failed. - AuthenticationFailed(s) => s.to_string(), // TODO: add support for channel binding ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(), ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), @@ -55,11 +50,14 @@ impl UserFacingError for Error { pub type Result = std::result::Result; /// A result of one SASL exchange. +#[must_use] pub enum Step { /// We should continue exchanging messages. - Continue(T), + Continue(T, String), /// The client has been authenticated successfully. - Authenticated(R), + Success(R, String), + /// Authentication failed (reason attached). + Failure(&'static str), } /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. @@ -69,5 +67,5 @@ pub trait Mechanism: Sized { /// Produce a server challenge to be sent to the client. /// This is how this method is called in PostgreSQL (`libpq/sasl.h`). - fn exchange(self, input: &str) -> Result<(Step, String)>; + fn exchange(self, input: &str) -> Result>; } diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index 0e782c5f29..b24cc4bf44 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -48,28 +48,41 @@ impl SaslStream<'_, S> { } } +/// SASL authentication outcome. +/// It's much easier to match on those two variants +/// than to peek into a noisy protocol error type. +#[must_use = "caller must explicitly check for success"] +pub enum Outcome { + /// Authentication succeeded and produced some value. + Success(R), + /// Authentication failed (reason attached). 
+ Failure(&'static str), +} + impl SaslStream<'_, S> { /// Perform SASL message exchange according to the underlying algorithm /// until user is either authenticated or denied access. pub async fn authenticate( mut self, mut mechanism: M, - ) -> super::Result { + ) -> super::Result> { loop { let input = self.recv().await?; - let (moved, reply) = mechanism.exchange(input)?; + let step = mechanism.exchange(input)?; - use super::Step::*; - match moved { - Continue(moved) => { + use super::Step; + return Ok(match step { + Step::Continue(moved_mechanism, reply) => { self.send(&ServerMessage::Continue(&reply)).await?; - mechanism = moved; + mechanism = moved_mechanism; + continue; } - Authenticated(result) => { + Step::Success(result, reply) => { self.send(&ServerMessage::Final(&reply)).await?; - return Ok(result); + Outcome::Success(result) } - } + Step::Failure(reason) => Outcome::Failure(reason), + }); } } } diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index fca5585b25..882769a70d 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -64,12 +64,12 @@ impl<'a> Exchange<'a> { impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; - fn exchange(mut self, input: &str) -> sasl::Result<(sasl::Step, String)> { + fn exchange(mut self, input: &str) -> sasl::Result> { use {sasl::Step::*, ExchangeState::*}; match &self.state { Initial => { - let client_first_message = - ClientFirstMessage::parse(input).ok_or(SaslError::BadClientMessage)?; + let client_first_message = ClientFirstMessage::parse(input) + .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?; let server_first_message = client_first_message.build_server_first_message( &(self.nonce)(), @@ -84,15 +84,15 @@ impl sasl::Mechanism for Exchange<'_> { server_first_message, }; - Ok((Continue(self), msg)) + Ok(Continue(self, msg)) } SaltSent { cbind_flag, client_first_message_bare, server_first_message, } => { - let client_final_message = - ClientFinalMessage::parse(input).ok_or(SaslError::BadClientMessage)?; + let client_final_message = ClientFinalMessage::parse(input) + .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?; let channel_binding = cbind_flag.encode(|_| { self.cert_digest @@ -106,9 +106,7 @@ impl sasl::Mechanism for Exchange<'_> { } if client_final_message.nonce != server_first_message.nonce() { - return Err(SaslError::AuthenticationFailed( - "combined nonce doesn't match", - )); + return Err(SaslError::BadClientMessage("combined nonce doesn't match")); } let signature_builder = SignatureBuilder { @@ -121,14 +119,15 @@ impl sasl::Mechanism for Exchange<'_> { .build(&self.secret.stored_key) .derive_client_key(&client_final_message.proof); - if client_key.sha256() != self.secret.stored_key { - return Err(SaslError::AuthenticationFailed("password doesn't match")); + // Auth fails either if keys don't match or it's pre-determined to fail. + if client_key.sha256() != self.secret.stored_key || self.secret.doomed { + return Ok(Failure("password doesn't match")); } let msg = client_final_message .build_server_final_message(signature_builder, &self.secret.server_key); - Ok((Authenticated(client_key), msg)) + Ok(Success(client_key, msg)) } } } diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 765aef4443..89668465fa 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -14,6 +14,9 @@ pub struct ServerSecret { pub stored_key: ScramKey, /// Used by client to verify server's signature. 
pub server_key: ScramKey, + /// Should auth fail no matter what? + /// This is exactly the case for mocked secrets. + pub doomed: bool, } impl ServerSecret { @@ -30,6 +33,7 @@ impl ServerSecret { salt_base64: salt.to_owned(), stored_key: base64_decode_array(stored_key)?.into(), server_key: base64_decode_array(server_key)?.into(), + doomed: false, }; Some(secret) @@ -38,16 +42,16 @@ impl ServerSecret { /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. - #[allow(dead_code)] - pub fn mock(user: &str, nonce: &[u8; 32]) -> Self { + pub fn mock(user: &str, nonce: [u8; 32]) -> Self { // Refer to `auth-scram.c : scram_mock_salt`. - let mocked_salt = super::sha256([user.as_bytes(), nonce]); + let mocked_salt = super::sha256([user.as_bytes(), &nonce]); Self { iterations: 4096, salt_base64: base64::encode(&mocked_salt), stored_key: ScramKey::default(), server_key: ScramKey::default(), + doomed: true, } } @@ -67,6 +71,7 @@ impl ServerSecret { salt_base64: base64::encode(&salt), stored_key: password.client_key().sha256(), server_key: password.server_key(), + doomed: false, }) } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 8e4084775c..19e1479068 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -109,8 +109,9 @@ impl PqStream { /// Write the error message using [`Self::write_message`], then re-throw it. /// Allowing string literals is safe under the assumption they might not contain any runtime info. + /// This method exists due to `&str` not implementing `Into`. pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { - // This method exists due to `&str` not implementing `Into` + tracing::info!("forwarding error to user: {error}"); self.write_message(&BeMessage::ErrorResponse(error)).await?; bail!(error) } @@ -122,6 +123,7 @@ impl PqStream { E: UserFacingError + Into, { let msg = error.to_string_client(); + tracing::info!("forwarding error to user: {msg}"); self.write_message(&BeMessage::ErrorResponse(&msg)).await?; bail!(error) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0d64ca6d65..e3f8247274 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2092,62 +2092,73 @@ class PSQL: class NeonProxy(PgProtocol): + link_auth_uri: str = "http://dummy-uri" + + class AuthBackend(abc.ABC): + """All auth backends must inherit from this class""" + + @property + def default_conn_url(self) -> Optional[str]: + return None + + @abc.abstractmethod + def extra_args(self) -> list[str]: + pass + + class Link(AuthBackend): + def extra_args(self) -> list[str]: + return [ + # Link auth backend params + *["--auth-backend", "link"], + *["--uri", NeonProxy.link_auth_uri], + ] + + @dataclass(frozen=True) + class Postgres(AuthBackend): + pg_conn_url: str + + @property + def default_conn_url(self) -> Optional[str]: + return self.pg_conn_url + + def extra_args(self) -> list[str]: + return [ + # Postgres auth backend params + *["--auth-backend", "postgres"], + *["--auth-endpoint", self.pg_conn_url], + ] + def __init__( self, + neon_binpath: Path, proxy_port: int, http_port: int, mgmt_port: int, - neon_binpath: Path, - auth_endpoint=None, + auth_backend: NeonProxy.AuthBackend, ): - super().__init__(dsn=auth_endpoint, port=proxy_port) - self.host = "127.0.0.1" + host = "127.0.0.1" + super().__init__(dsn=auth_backend.default_conn_url, host=host, port=proxy_port) + 
+ self.host = host self.http_port = http_port self.neon_binpath = neon_binpath self.proxy_port = proxy_port self.mgmt_port = mgmt_port - self.auth_endpoint = auth_endpoint + self.auth_backend = auth_backend self._popen: Optional[subprocess.Popen[bytes]] = None - self.link_auth_uri_prefix = "http://dummy-uri" - def start(self): - """ - Starts a proxy with option '--auth-backend postgres' and a postgres instance - already provided though '--auth-endpoint '." - """ + def start(self) -> NeonProxy: assert self._popen is None - assert self.auth_endpoint is not None - - # Start proxy args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], - *["--auth-backend", "postgres"], - *["--auth-endpoint", self.auth_endpoint], + *self.auth_backend.extra_args(), ] self._popen = subprocess.Popen(args) self._wait_until_ready() - - def start_with_link_auth(self): - """ - Starts a proxy with option '--auth-backend link' and a dummy authentication link '--uri dummy-auth-link'." - """ - assert self._popen is None - - # Start proxy - bin_proxy = str(self.neon_binpath / "proxy") - args = [bin_proxy] - args.extend(["--http", f"{self.host}:{self.http_port}"]) - args.extend(["--proxy", f"{self.host}:{self.proxy_port}"]) - args.extend(["--mgmt", f"{self.host}:{self.mgmt_port}"]) - args.extend(["--auth-backend", "link"]) - args.extend(["--uri", self.link_auth_uri_prefix]) - arg_str = " ".join(args) - log.info(f"starting proxy with command line ::: {arg_str}") - self._popen = subprocess.Popen(args, stdout=subprocess.PIPE) - self._wait_until_ready() + return self @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): @@ -2158,7 +2169,7 @@ class NeonProxy(PgProtocol): request_result.raise_for_status() return request_result.text - def __enter__(self) -> "NeonProxy": + def __enter__(self) -> NeonProxy: return self def __exit__( @@ -2176,11 +2187,19 @@ class NeonProxy(PgProtocol): @pytest.fixture(scope="function") def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterator[NeonProxy]: """Neon proxy that routes through link auth.""" + http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() - with NeonProxy(proxy_port, http_port, neon_binpath=neon_binpath, mgmt_port=mgmt_port) as proxy: - proxy.start_with_link_auth() + + with NeonProxy( + neon_binpath=neon_binpath, + proxy_port=proxy_port, + http_port=http_port, + mgmt_port=mgmt_port, + auth_backend=NeonProxy.Link(), + ) as proxy: + proxy.start() yield proxy @@ -2204,11 +2223,11 @@ def static_proxy( http_port = port_distributor.get_port() with NeonProxy( + neon_binpath=neon_binpath, proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, - neon_binpath=neon_binpath, - auth_endpoint=auth_endpoint, + auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: proxy.start() yield proxy diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index e868d6b616..eab9505fbb 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -28,61 +28,58 @@ def test_password_hack(static_proxy: NeonProxy): static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) -def get_session_id(uri_prefix, uri_line): - assert uri_prefix in uri_line - - url_parts = urlparse(uri_line) - psql_session_id = url_parts.path[1:] - assert psql_session_id.isalnum(), 
"session_id should only contain alphanumeric chars" - - return psql_session_id - - -async def find_auth_link(link_auth_uri_prefix, proc): - for _ in range(100): - line = (await proc.stderr.readline()).decode("utf-8").strip() - log.info(f"psql line: {line}") - if link_auth_uri_prefix in line: - log.info(f"SUCCESS, found auth url: {line}") - return line - - -async def activate_link_auth(local_vanilla_pg, link_proxy, psql_session_id): - pg_user = "proxy" - - log.info("creating a new user for link auth test") - local_vanilla_pg.start() - local_vanilla_pg.safe_psql(f"create user {pg_user} with login superuser") - - db_info = json.dumps( - { - "session_id": psql_session_id, - "result": { - "Success": { - "host": local_vanilla_pg.default_options["host"], - "port": local_vanilla_pg.default_options["port"], - "dbname": local_vanilla_pg.default_options["dbname"], - "user": pg_user, - "project": "irrelevant", - } - }, - } - ) - - log.info("sending session activation message") - psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info) - out = (await psql.stdout.read()).decode("utf-8").strip() - assert out == "ok" - - @pytest.mark.asyncio async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy): + def get_session_id(uri_prefix, uri_line): + assert uri_prefix in uri_line + + url_parts = urlparse(uri_line) + psql_session_id = url_parts.path[1:] + assert psql_session_id.isalnum(), "session_id should only contain alphanumeric chars" + + return psql_session_id + + async def find_auth_link(link_auth_uri, proc): + for _ in range(100): + line = (await proc.stderr.readline()).decode("utf-8").strip() + log.info(f"psql line: {line}") + if link_auth_uri in line: + log.info(f"SUCCESS, found auth url: {line}") + return line + + async def activate_link_auth(local_vanilla_pg, link_proxy, psql_session_id): + pg_user = "proxy" + + log.info("creating a new user for link auth test") + local_vanilla_pg.start() + local_vanilla_pg.safe_psql(f"create user {pg_user} with login superuser") + + db_info = json.dumps( + { + "session_id": psql_session_id, + "result": { + "Success": { + "host": local_vanilla_pg.default_options["host"], + "port": local_vanilla_pg.default_options["port"], + "dbname": local_vanilla_pg.default_options["dbname"], + "user": pg_user, + "project": "irrelevant", + } + }, + } + ) + + log.info("sending session activation message") + psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info) + out = (await psql.stdout.read()).decode("utf-8").strip() + assert out == "ok" + psql = await PSQL(host=link_proxy.host, port=link_proxy.proxy_port).run("select 42") - uri_prefix = link_proxy.link_auth_uri_prefix - link = await find_auth_link(uri_prefix, psql) + base_uri = link_proxy.link_auth_uri + link = await find_auth_link(base_uri, psql) - psql_session_id = get_session_id(uri_prefix, link) + psql_session_id = get_session_id(base_uri, link) await activate_link_auth(vanilla_pg, link_proxy, psql_session_id) assert psql.stdout is not None @@ -97,3 +94,31 @@ def test_proxy_options(static_proxy: NeonProxy): cur.execute("SHOW proxytest.option") value = cur.fetchall()[0][0] assert value == "value" + + +def test_auth_errors(static_proxy: NeonProxy): + # User does not exist + with pytest.raises(psycopg2.Error) as exprinfo: + static_proxy.connect(user="pinocchio", options="project=irrelevant") + text = str(exprinfo.value).strip() + assert text.endswith("password authentication failed for user 'pinocchio'") + + static_proxy.safe_psql( + "create role pinocchio 
with login password 'magic'", options="project=irrelevant" + ) + + # User exists, but password is missing + with pytest.raises(psycopg2.Error) as exprinfo: + static_proxy.connect(user="pinocchio", password=None, options="project=irrelevant") + text = str(exprinfo.value).strip() + assert text.endswith("password authentication failed for user 'pinocchio'") + + # User exists, but password is wrong + with pytest.raises(psycopg2.Error) as exprinfo: + static_proxy.connect(user="pinocchio", password="bad", options="project=irrelevant") + text = str(exprinfo.value).strip() + assert text.endswith("password authentication failed for user 'pinocchio'") + + # Finally, check that the user can connect + with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"): + pass From 02c1c351dc4fe7ffb99d9b3e69e10c837f6548f3 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 13 Dec 2022 15:42:59 +0200 Subject: [PATCH 062/167] Create initial timeline without remote storage (#3077) Removes the race during pageserver initial timeline creation that lead to partial layer uploads. This race is only reproducible in test code, we do not create initial timelines in cloud (yet, at least), but still nice to remove the non-deterministic behavior. --- control_plane/src/background_process.rs | 12 +- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/pageserver.rs | 147 +++++++++++++++++------- libs/remote_storage/src/lib.rs | 9 +- pageserver/src/config.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 9 +- 6 files changed, 126 insertions(+), 55 deletions(-) diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 1a5ac1e2fe..8909e27c94 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -51,21 +51,21 @@ pub enum InitialPidFile<'t> { } /// Start a background child process using the parameters given. 
-pub fn start_process< - F, - S: AsRef, - EI: IntoIterator, // Not generic AsRef, otherwise empty `envs` prevents type inference ->( +pub fn start_process( process_name: &str, datadir: &Path, command: &Path, - args: &[S], + args: AI, envs: EI, initial_pid_file: InitialPidFile, process_status_check: F, ) -> anyhow::Result where F: Fn() -> anyhow::Result, + AI: IntoIterator, + A: AsRef, + // Not generic AsRef, otherwise empty `envs` prevents type inference + EI: IntoIterator, { let log_path = datadir.join(format!("{process_name}.log")); let process_log_file = fs::OpenOptions::new() diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6f059d535e..f0c3b983f0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -341,7 +341,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .get_many::("pageserver-config-override") .into_iter() .flatten() - .map(|s| s.as_str()) + .map(String::as_str) .collect() } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 51e540e39c..3575e75db9 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,9 +1,10 @@ +use std::borrow::Cow; use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; use std::num::NonZeroU64; -use std::path::{Path, PathBuf}; -use std::process::Child; +use std::path::PathBuf; +use std::process::{Child, Command}; use std::{io, result}; use anyhow::{bail, ensure, Context}; @@ -129,6 +130,8 @@ impl PageServerNode { overrides } + /// Initializes a pageserver node by creating its config with the overrides provided, + /// and creating an initial tenant and timeline afterwards. pub fn initialize( &self, create_tenant: Option, @@ -136,11 +139,28 @@ impl PageServerNode { config_overrides: &[&str], pg_version: u32, ) -> anyhow::Result { + // First, run `pageserver --init` and wait for it to write a config into FS and exit. + self.pageserver_init(config_overrides).with_context(|| { + format!( + "Failed to run init for pageserver node {}", + self.env.pageserver.id, + ) + })?; + + // Then, briefly start it fully to run HTTP commands on it, + // to create initial tenant and timeline. + // We disable the remote storage, since we stop pageserver right after the timeline creation, + // hence most of the uploads will either aborted or not started: no point to start them at all. + let disabled_remote_storage_override = "remote_storage={}"; let mut pageserver_process = self - .start_node(config_overrides, &self.env.base_data_dir, true) + .start_node( + &[disabled_remote_storage_override], + // Previous overrides will be taken from the config created before, don't overwrite them. 
+ false, + ) .with_context(|| { format!( - "Failed to start a process for pageserver {}", + "Failed to start a process for pageserver node {}", self.env.pageserver.id, ) })?; @@ -201,55 +221,73 @@ impl PageServerNode { } pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result { - self.start_node(config_overrides, &self.repo_path(), false) + self.start_node(config_overrides, false) } - fn start_node( - &self, - config_overrides: &[&str], - datadir: &Path, - update_config: bool, - ) -> anyhow::Result { - let mut overrides = self.neon_local_overrides(); - overrides.extend(config_overrides.iter().map(|&c| c.to_owned())); - - print!( - "Starting pageserver at '{}' in '{}'", + fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + let datadir = self.repo_path(); + let node_id = self.env.pageserver.id; + println!( + "Initializing pageserver node {} at '{}' in {:?}", + node_id, self.pg_connection_config.raw_address(), - datadir.display() + datadir ); io::stdout().flush()?; - let mut args = vec![ - "-D", - datadir.to_str().with_context(|| { - format!("Datadir path {datadir:?} cannot be represented as a unicode string") - })?, - ]; + let datadir_path_str = datadir.to_str().with_context(|| { + format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}") + })?; + let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); + args.push(Cow::Borrowed("--init")); + let init_output = Command::new(&self.env.pageserver_bin()) + .args(args.iter().map(Cow::as_ref)) + .envs(self.pageserver_env_variables()?) + .output() + .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?; + + anyhow::ensure!( + init_output.status.success(), + "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}", + node_id, + String::from_utf8_lossy(&init_output.stdout), + String::from_utf8_lossy(&init_output.stderr), + ); + + Ok(()) + } + + fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result { + let mut overrides = self.neon_local_overrides(); + overrides.extend(config_overrides.iter().map(|&c| c.to_owned())); + + let datadir = self.repo_path(); + print!( + "Starting pageserver node {} at '{}' in {:?}", + self.env.pageserver.id, + self.pg_connection_config.raw_address(), + datadir + ); + io::stdout().flush()?; + + let datadir_path_str = datadir.to_str().with_context(|| { + format!( + "Cannot start pageserver node {} in path that has no string representation: {:?}", + self.env.pageserver.id, datadir, + ) + })?; + let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); if update_config { - args.push("--update-config"); + args.push(Cow::Borrowed("--update-config")); } - for config_override in &overrides { - args.extend(["-c", config_override]); - } - - let envs = if self.env.pageserver.auth_type != AuthType::Trust { - // Generate a token to connect from the pageserver to a safekeeper - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; - vec![("ZENITH_AUTH_TOKEN".to_owned(), token)] - } else { - vec![] - }; background_process::start_process( "pageserver", - datadir, + &datadir, &self.env.pageserver_bin(), - &args, - envs, + args.iter().map(Cow::as_ref), + self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(&self.pid_file()), || match self.check_status() { Ok(()) => Ok(true), @@ -259,6 +297,35 @@ impl PageServerNode { ) } + fn pageserver_basic_args<'a>( + &self, + 
config_overrides: &'a [&'a str], + datadir_path_str: &'a str, + ) -> Vec> { + let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)]; + + let mut overrides = self.neon_local_overrides(); + overrides.extend(config_overrides.iter().map(|&c| c.to_owned())); + for config_override in overrides { + args.push(Cow::Borrowed("-c")); + args.push(Cow::Owned(config_override)); + } + + args + } + + fn pageserver_env_variables(&self) -> anyhow::Result> { + Ok(if self.env.pageserver.auth_type != AuthType::Trust { + // Generate a token to connect from the pageserver to a safekeeper + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; + vec![("ZENITH_AUTH_TOKEN".to_owned(), token)] + } else { + Vec::new() + }) + } + /// /// Stop the server. /// diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 3bbffd6941..28858fcbab 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -272,7 +272,7 @@ impl Debug for S3Config { } impl RemoteStorageConfig { - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { let local_path = toml.get("local_path"); let bucket_name = toml.get("bucket_name"); let bucket_region = toml.get("bucket_region"); @@ -296,7 +296,8 @@ impl RemoteStorageConfig { .context("Failed to parse 'concurrency_limit' as a positive integer")?; let storage = match (local_path, bucket_name, bucket_region) { - (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), + // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled + (None, None, None) => return Ok(None), (_, Some(_), None) => { bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") } @@ -322,11 +323,11 @@ impl RemoteStorageConfig { (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), }; - Ok(RemoteStorageConfig { + Ok(Some(RemoteStorageConfig { max_concurrent_syncs, max_sync_errors, storage, - }) + })) } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c07907a1c9..48e9f32276 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -524,7 +524,7 @@ impl PageServerConf { )), "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?)) + builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) } "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 92cd5db203..cab5053b5b 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -318,12 +318,15 @@ fn set_id(workdir: &Path, given_id: Option) -> Result { } // Parse RemoteStorage from TOML table. 
-fn parse_remote_storage(storage_conf: &str) -> Result { +fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {}", storage_conf); + let storage_conf_toml = format!("remote_storage = {storage_conf}"); let parsed_toml = storage_conf_toml.parse::()?; // parse let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - RemoteStorageConfig::from_toml(storage_conf_parsed_toml) + RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { + // XXX: Don't print the original toml here, there might be some sensitive data + parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") + }) } #[test] From 4603a4cbb574c07a45b5e1caa8b46b5c11a2cf54 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 13 Dec 2022 10:43:30 +0100 Subject: [PATCH 063/167] Bypass SENTRY_ENVIRONMENT variable in order to filter panics in sentry by environment. --- .github/ansible/neon-stress.hosts.yaml | 1 + .github/ansible/prod.ap-southeast-1.hosts.yaml | 1 + .github/ansible/prod.eu-central-1.hosts.yaml | 1 + .github/ansible/prod.us-east-2.hosts.yaml | 1 + .github/ansible/production.hosts.yaml | 1 + .github/ansible/staging.eu-west-1.hosts.yaml | 1 + .github/ansible/staging.hosts.yaml | 1 + .github/ansible/staging.us-east-2.hosts.yaml | 1 + .github/ansible/systemd/pageserver.service | 2 +- .github/ansible/systemd/safekeeper.service | 2 +- .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml | 1 + .../helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml | 2 ++ .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml | 1 + .../dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 1 + .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 1 + .../helm-values/dev-us-east-2-beta.neon-storage-broker.yaml | 2 ++ .github/helm-values/neon-stress.neon-storage-broker.yaml | 2 ++ .github/helm-values/neon-stress.proxy-scram.yaml | 3 +++ .github/helm-values/neon-stress.proxy.yaml | 1 + .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 1 + .../prod-ap-southeast-1-epsilon.neon-storage-broker.yaml | 2 ++ .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 1 + .../prod-eu-central-1-gamma.neon-storage-broker.yaml | 2 ++ .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 1 + .../helm-values/prod-us-east-2-delta.neon-storage-broker.yaml | 2 ++ .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 1 + .../helm-values/prod-us-west-2-eta.neon-storage-broker.yaml | 2 ++ .github/helm-values/production.neon-storage-broker.yaml | 2 ++ .github/helm-values/production.proxy-scram.yaml | 3 +++ .github/helm-values/production.proxy.yaml | 1 + .github/helm-values/staging.neon-storage-broker.yaml | 2 ++ .github/helm-values/staging.proxy-scram.yaml | 1 + .github/helm-values/staging.proxy.yaml | 1 + libs/utils/src/sentry_init.rs | 2 ++ 34 files changed, 48 insertions(+), 2 deletions(-) diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml index 6b2166e7a6..5d5df5a6d5 100644 --- a/.github/ansible/neon-stress.hosts.yaml +++ b/.github/ansible/neon-stress.hosts.yaml @@ -14,6 +14,7 @@ storage: safekeeper_s3_prefix: neon-stress/wal hostname_suffix: ".local" remote_user: admin + sentry_environment: development children: pageservers: hosts: diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml 
b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 76ec3d29ae..bcc7bb3b16 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: ap-southeast-1 ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1 console_region_id: aws-ap-southeast-1 + sentry_environment: production children: pageservers: diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index c8a8b15ddb..2b372d0fcb 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: eu-central-1 ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1 console_region_id: aws-eu-central-1 + sentry_environment: production children: pageservers: diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 36a5337a8d..7a4002ec88 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: us-east-2 ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2 console_region_id: aws-us-east-2 + sentry_environment: production children: pageservers: diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index cea0556ba1..d22c845966 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -14,6 +14,7 @@ storage: safekeeper_s3_prefix: prod-1/wal hostname_suffix: ".local" remote_user: admin + sentry_environment: production children: pageservers: diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 4a64423a0d..90f00175b0 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: eu-west-1 ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1 console_region_id: aws-eu-west-1 + sentry_environment: development children: pageservers: diff --git a/.github/ansible/staging.hosts.yaml b/.github/ansible/staging.hosts.yaml index a580b7563a..79acfd1d2a 100644 --- a/.github/ansible/staging.hosts.yaml +++ b/.github/ansible/staging.hosts.yaml @@ -13,6 +13,7 @@ storage: safekeeper_s3_prefix: us-stage/wal hostname_suffix: ".local" remote_user: admin + sentry_environment: development children: pageservers: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 5a5a673a5e..d2b7fae12a 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: us-east-2 ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2 console_region_id: aws-us-east-2 + sentry_environment: development children: pageservers: diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service index 9847ee0f9e..4570a666fa 100644 --- a/.github/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=pageserver -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER 
}} SENTRY_ENVIRONMENT={{ sentry_environment }} ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index 828655e435..d7d8d26b1a 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=safekeeper -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} SENTRY_ENVIRONMENT={{ sentry_environment }} ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index 0e98636057..f89eea5972 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -8,6 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.eu-west-1.aws.neon.build" + sentryEnvironment: "development" # -- Additional labels for neon-proxy pods podLabels: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index f139244cff..e876367a18 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -53,3 +53,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "development" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 685cbd192d..eeb025277b 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,6 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" + sentryEnvironment: "development" # -- Additional labels for neon-proxy-link pods podLabels: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 76653e769c..ed710bc196 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -8,6 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.cloud.stage.neon.tech" + sentryEnvironment: "development" # -- Additional labels for neon-proxy pods podLabels: diff --git 
a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index f2247fa4c1..ba0109c1eb 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -8,6 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.us-east-2.aws.neon.build" + sentryEnvironment: "development" # -- Additional labels for neon-proxy pods podLabels: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index d59d2ebe70..dcf4b99de2 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -53,3 +53,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "development" diff --git a/.github/helm-values/neon-stress.neon-storage-broker.yaml b/.github/helm-values/neon-stress.neon-storage-broker.yaml index fd35c5e14e..e11e5d4214 100644 --- a/.github/helm-values/neon-stress.neon-storage-broker.yaml +++ b/.github/helm-values/neon-stress.neon-storage-broker.yaml @@ -52,3 +52,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "development" diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml index 8f55d31c87..bbdd13781c 100644 --- a/.github/helm-values/neon-stress.proxy-scram.yaml +++ b/.github/helm-values/neon-stress.proxy-scram.yaml @@ -24,3 +24,6 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +settings: + sentryEnvironment: "development" diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml index ce432ca23c..c3ecf6c743 100644 --- a/.github/helm-values/neon-stress.proxy.yaml +++ b/.github/helm-values/neon-stress.proxy.yaml @@ -4,6 +4,7 @@ settings: authBackend: "link" authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/" uri: "https://console.dev.neon.tech/psql_session/" + sentryEnvironment: "development" # -- Additional labels for zenith-proxy pods podLabels: diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index f90f89a516..a37a37406c 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -8,6 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-release.local/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" + sentryEnvironment: "production" # -- Additional labels for neon-proxy pods podLabels: diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml index 9654097934..0abc6ebaa1 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml @@ -53,3 +53,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "production" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 33a1154099..69d00a7e9c 100644 --- 
a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -8,6 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-release.local/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" + sentryEnvironment: "production" # -- Additional labels for neon-proxy pods podLabels: diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml index 9582327df3..d44a3eab5c 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -53,3 +53,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "production" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 5f9f2d2e66..19d91fa4dc 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -8,6 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-release.local/management/api/v2" domain: "*.us-east-2.aws.neon.tech" + sentryEnvironment: "production" # -- Additional labels for neon-proxy pods podLabels: diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml index 7c64d4c7bd..b9eeff5681 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -53,3 +53,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "production" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 1747cb95b1..f148188c48 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -8,6 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-release.local/management/api/v2" domain: "*.us-west-2.aws.neon.tech" + sentryEnvironment: "production" # -- Additional labels for neon-proxy pods podLabels: diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml index 1014d36264..249f76303a 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml @@ -53,3 +53,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "production" diff --git a/.github/helm-values/production.neon-storage-broker.yaml b/.github/helm-values/production.neon-storage-broker.yaml index 395b023671..aa64081da3 100644 --- a/.github/helm-values/production.neon-storage-broker.yaml +++ b/.github/helm-values/production.neon-storage-broker.yaml @@ -52,3 +52,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "production" diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml index 54b0fbcd98..7011a9ce01 100644 --- a/.github/helm-values/production.proxy-scram.yaml +++ b/.github/helm-values/production.proxy-scram.yaml @@ -22,3 +22,6 @@ metrics: enabled: true selector: release: 
kube-prometheus-stack + +settings: + sentryEnvironment: "production" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml index c26a6258be..9db68c1044 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/production.proxy.yaml @@ -2,6 +2,7 @@ settings: authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" + sentryEnvironment: "production" # -- Additional labels for zenith-proxy pods podLabels: diff --git a/.github/helm-values/staging.neon-storage-broker.yaml b/.github/helm-values/staging.neon-storage-broker.yaml index bffcf41ef0..6b21c286a1 100644 --- a/.github/helm-values/staging.neon-storage-broker.yaml +++ b/.github/helm-values/staging.neon-storage-broker.yaml @@ -52,3 +52,5 @@ extraManifests: matchNames: - "{{ .Release.Namespace }}" +settings: + sentryEnvironment: "development" diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml index 91422e754a..f249df3612 100644 --- a/.github/helm-values/staging.proxy-scram.yaml +++ b/.github/helm-values/staging.proxy-scram.yaml @@ -8,6 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.cloud.stage.neon.tech" + sentryEnvironment: "development" # -- Additional labels for zenith-proxy pods podLabels: diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml index 25842429a5..62b4c4a595 100644 --- a/.github/helm-values/staging.proxy.yaml +++ b/.github/helm-values/staging.proxy.yaml @@ -8,6 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" + sentryEnvironment: "development" # -- Additional labels for zenith-proxy pods podLabels: diff --git a/libs/utils/src/sentry_init.rs b/libs/utils/src/sentry_init.rs index 4f1c297854..992cb5c671 100644 --- a/libs/utils/src/sentry_init.rs +++ b/libs/utils/src/sentry_init.rs @@ -10,11 +10,13 @@ pub fn init_sentry( extra_options: &[(&str, &str)], ) -> Option { let dsn = env::var("SENTRY_DSN").ok()?; + let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into()); let guard = sentry::init(( dsn, sentry::ClientOptions { release: release_name, + environment: Some(environment.into()), ..Default::default() }, )); From feb07ed5101eb1dbcb36c3562bf81bb81ee972dd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Dec 2022 14:01:29 +0000 Subject: [PATCH 064/167] deploy (old): replace actions/setup-python@v4 with ansible image (#3081) Replace actions/setup-python@v4 with the ansible image to fix ``` Version 3.10 was not found in the local cache Error: The version '3.10' with architecture 'x64' was not found for this operating system. 
``` --- .github/workflows/build_and_test.yml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8707065ef2..678f9cbfce 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -752,7 +752,7 @@ jobs: deploy: runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] @@ -772,16 +772,6 @@ jobs: submodules: true fetch-depth: 0 - - name: Setup python - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - - name: Setup ansible - run: | - export PATH="/root/.local/bin:$PATH" - pip install --progress-bar off --user ansible boto3 toml - - name: Redeploy run: | export DOCKER_TAG=${{needs.tag.outputs.build-tag}} From 0c915dcb1d919d6f2225cffdc48f200e7f447bc7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 13 Dec 2022 15:53:08 +0100 Subject: [PATCH 065/167] Timeline::download_missing: fix handling of mismatched layer size Before this patch, when we decided to rename a layer file to backup because of a layer file size mismatch, we would not remove the layer from the layer map, but we would remove the on-disk file. Because we re-download the file immediately after, we simply end up with two layer objects in memory that reference the same file in the layer map. So, GetPage() would work fine until one of the layers gets delete()'d. The other layer's delete() would then fail. Future work: prevent insertion of the same layer at LayerMap level so that we notice such bugs sooner. --- pageserver/src/tenant/timeline.rs | 73 +++++++++++++++++++------------ 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9a4194d916..a746fd9bf8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1013,9 +1013,8 @@ impl Timeline { // 1) if there was another pageserver that came and generated new files // 2) during attach of a timeline with big history which we currently do not do let mut local_only_layers = local_layers; - let timeline_dir = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for remote_layer_name in &index_part.timeline_layers { - local_only_layers.remove(remote_layer_name); + let local_layer = local_only_layers.remove(remote_layer_name); let remote_layer_metadata = index_part .layer_metadata @@ -1023,41 +1022,57 @@ impl Timeline { .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); - let local_layer_path = timeline_dir.join(remote_layer_name.file_name()); + // Is the local layer's size different from the size stored in the + // remote index file? If so, rename_to_backup those files & remove + // local_layer from the layer map. + // We'll download a fresh copy of the layer file below.
+ if let Some(local_layer) = local_layer { + let local_layer_path = local_layer.local_path(); + ensure!( + local_layer_path.exists(), + "every layer from local_layers must exist on disk: {}", + local_layer_path.display() + ); - if local_layer_path.exists() { - let mut already_downloaded = true; - // Are there any local files that exist, with a size that doesn't match - // with the size stored in the remote index file? - // If so, rename_to_backup those files so that we re-download them later. if let Some(remote_size) = remote_layer_metadata.file_size() { - match local_layer_path.metadata() { - Ok(metadata) => { - let local_size = metadata.len(); - - if local_size != remote_size { - warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); - if let Err(err) = rename_to_backup(&local_layer_path) { - error!("could not rename file {local_layer_path:?}: {err:?}"); - } else { - self.metrics.current_physical_size_gauge.sub(local_size); - already_downloaded = false; - } - } - } - Err(err) => { - error!("could not get size of local file {local_layer_path:?}: {err:?}") + let metadata = local_layer_path.metadata().with_context(|| { + format!( + "get file size of local layer {}", + local_layer_path.display() + ) + })?; + let local_size = metadata.len(); + if local_size != remote_size { + warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); + if let Err(err) = rename_to_backup(&local_layer_path) { + assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); + anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); + } else { + self.metrics.current_physical_size_gauge.sub(local_size); + self.layers.write().unwrap().remove_historic(local_layer); + // fall-through to adding the remote layer } + } else { + debug!( + "layer is present locally and file size matches remote, using it: {}", + local_layer_path.display() + ); + continue; } - } - - if already_downloaded { + } else { + debug!( + "layer is present locally and remote does not have file size, using it: {}", + local_layer_path.display() + ); continue; } - } else { - info!("remote layer {remote_layer_name:?} does not exist locally"); } + info!( + "remote layer does not exist locally, downloading it now: {}", + remote_layer_name.file_name() + ); + match remote_layer_name { LayerFileName::Image(imgfilename) => { if imgfilename.lsn > up_to_date_disk_consistent_lsn { From 0bc488b7234924c2a4e505374710a79cd67a9725 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 13 Dec 2022 15:12:04 +0100 Subject: [PATCH 066/167] Add sentry environment for pageserver and safekeepers in new region (us-west-2) --- .github/ansible/prod.us-west-2.hosts.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index d5ef761cd5..c59334b649 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: us-west-2 ansible_aws_ssm_bucket_name: neon-prod-storage-us-west-2 console_region_id: aws-us-west-2-new + sentry_environment: production children: pageservers: From b39d6126bba388a9259f80d236167514dc1726d3 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 13 Dec 2022 19:57:39 +0100 Subject: [PATCH 067/167] Force ansible to use local 
ansible.cfg (#3089) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 678f9cbfce..b5941fe218 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -793,7 +793,7 @@ jobs: ssh-add ssh-key rm -f ssh-key ssh-key-cert.pub ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-new: From 826214ae565622f896de32c8124e2bc0f3bcec74 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 13 Dec 2022 21:06:18 +0100 Subject: [PATCH 068/167] Force ansible-galaxy to also use local ansible.cfg (#3091) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b5941fe218..7fcc7671a9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -792,7 +792,7 @@ jobs: chmod 0600 ssh-key ssh-add ssh-key rm -f ssh-key ssh-key-cert.pub - ansible-galaxy collection install sivel.toiletwater + ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version From 228f9e4322a4dd12a470f1ab0a8d3d9d7e5b8215 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 13 Dec 2022 23:59:49 +0100 Subject: [PATCH 069/167] Use default folder for ansible collections (#3092) --- .github/ansible/ansible.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/ansible/ansible.cfg b/.github/ansible/ansible.cfg index 0497ee401d..5818a64455 100644 --- a/.github/ansible/ansible.cfg +++ b/.github/ansible/ansible.cfg @@ -3,7 +3,6 @@ localhost_warning = False host_key_checking = False timeout = 30 -collections_paths = ./collections [ssh_connection] ssh_args = -F ./ansible.ssh.cfg From c819b699be9782b26c687b83d65ee79eaf4605c9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 13 Dec 2022 23:02:45 +0000 Subject: [PATCH 070/167] Nightly Benchmark: run neon-captest-reuse from staging (#3086) The project has been migrated (now it is `restless-king-632302`), and now we should run tests from staging runners. 
Test run: https://github.com/neondatabase/neon/actions/runs/3686865543/jobs/6241367161 Ref https://github.com/neondatabase/cloud/issues/2836 --- .github/workflows/benchmarking.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index a2c05a9222..e3e0f1e820 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -115,13 +115,10 @@ jobs: # neon-captest-prefetch: Same, with prefetching enabled (new project) # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-new, neon-captest-prefetch, rds-postgres ] + platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ] db_size: [ 10gb ] runner: [ us-east-2 ] include: - - platform: neon-captest-reuse - db_size: 10gb - runner: dev # TODO: Switch to us-east-2 after dry-bonus-223539 migration to staging - platform: neon-captest-new db_size: 50gb runner: us-east-2 From 827ee10b5a58f3832a1dd0f2599dcef48c2fb548 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 14 Dec 2022 01:51:42 +0100 Subject: [PATCH 071/167] Disable neon-stress deploy (#3093) --- .github/workflows/build_and_test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7fcc7671a9..7a887cbece 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -740,8 +740,7 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}' - NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}' - echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT + echo "include=[$STAGING]" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT From f8ab5ef3b51eb7f37ccfa24ec232153424136088 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 14 Dec 2022 14:58:12 +0300 Subject: [PATCH 072/167] Update broker endpoint for prod-us-west-2. 
(#3095) --- .github/ansible/prod.us-west-2.hosts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index c59334b649..682ee5994d 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-west-2 bucket_region: us-west-2 console_mgmt_base_url: http://console-release.local - etcd_endpoints: etcd-0.us-west-2.aws.neon.tech:2379 + broker_endpoint: https://storage-broker.eta.us-west-2.internal.aws.neon.tech:443 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: From ada5b7158fadc5136a017a680b0dc4a97157a212 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Wed, 14 Dec 2022 14:09:16 +0200 Subject: [PATCH 073/167] Fix Issue #3014 (#3059) * TenantConfigRequest now supports tenant_id as hex string input instead of bytes array * Config file is truncated in each creation/update --- libs/pageserver_api/src/models.rs | 1 + pageserver/src/tenant.rs | 7 +++++-- test_runner/regress/test_tenant_conf.py | 25 +++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index cf3252a9ae..e49b7051d2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -117,6 +117,7 @@ impl TenantCreateRequest { #[serde_as] #[derive(Serialize, Deserialize)] pub struct TenantConfigRequest { + #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, #[serde(default)] #[serde_as(as = "Option")] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 80b65d281f..4fcb1e3ba3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1690,7 +1690,7 @@ impl Tenant { let _enter = info_span!("saving tenantconf").entered(); info!("persisting tenantconf to {}", target_config_path.display()); - // TODO this will prepend comments endlessly + // TODO this will prepend comments endlessly ? let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. 
@@ -1703,7 +1703,10 @@ impl Tenant { let mut target_config_file = VirtualFile::open_with_options( target_config_path, - OpenOptions::new().write(true).create_new(first_save), + OpenOptions::new() + .truncate(true) // This needed for overwriting with small config files + .write(true) + .create_new(first_save), )?; target_config_file diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 46a945a58b..6d621fbb77 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -133,3 +133,28 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "pitr_interval": 2592000, }.items() ) + + # update the config with very short config and make sure no trailing chars are left from previous config + env.neon_cli.config_tenant( + tenant_id=tenant, + conf={ + "pitr_interval": "1 min", + }, + ) + + # restart the pageserver and ensure that the config is still correct + env.pageserver.stop() + env.pageserver.start() + + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: + pscur.execute(f"show {tenant}") + res = pscur.fetchone() + log.info(f"after restart res: {res}") + assert all( + i in res.items() + for i in { + "compaction_period": 20, + "pitr_interval": 60, + }.items() + ) From d3787f9b47e4c3dfb6fc159912b78b1ab36dd096 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Dec 2022 13:04:04 +0000 Subject: [PATCH 074/167] neon-project-create/delete: print project id to stdout (#3073) Print project_id to GitHub Actions stdout --- .github/actions/neon-project-create/action.yml | 2 ++ .github/actions/neon-project-delete/action.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index b9d9182882..0480bfbc84 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -55,6 +55,8 @@ runs: project_id=$(echo $project | jq --raw-output '.project.id') echo "project_id=${project_id}" >> $GITHUB_OUTPUT + + echo "Project ${project_id} has been created" env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index cd58c629e5..adc8510a34 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -27,6 +27,8 @@ runs: --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" + + echo "Project ${PROJECT_ID} has been deleted" env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} From 4d201619edcdb73fcc3aecd62681c317e499be00 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 14 Dec 2022 15:09:08 +0200 Subject: [PATCH 075/167] Remove large database files after every test suite (#3090) Closes https://github.com/neondatabase/neon/issues/1984 Closes https://github.com/neondatabase/neon/pull/2830 A follow-up of https://github.com/neondatabase/neon/pull/2830, I've noticed that benchmarks failed again due to out of space issues. Removes most of the pageserver and safekeeper files from disk after every pytest suite run. ``` $ poetry run pytest -vvsk "test_tenant_redownloads_truncated_file_on_startup[local_fs]" # ... $ du -h test_output/test_tenant_redownloads_truncated_file_on_startup\[local_fs\] # ... 
104K test_output/test_tenant_redownloads_truncated_file_on_startup[local_fs] $ poetry run pytest -vvsk "test_tenant_redownloads_truncated_file_on_startup[local_fs]" --preserve-database-files # ... $ du -h test_output/test_tenant_redownloads_truncated_file_on_startup\[local_fs\] # ... 123M test_output/test_tenant_redownloads_truncated_file_on_startup[local_fs] ``` Co-authored-by: Bojan Serafimov --- test_runner/README.md | 8 ++- test_runner/fixtures/neon_fixtures.py | 68 ++++++++++++++++++++++- test_runner/performance/README.md | 1 + test_runner/regress/test_compatibility.py | 2 + 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/test_runner/README.md b/test_runner/README.md index bbb8532b52..877498bae7 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -76,9 +76,15 @@ Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"` should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as -`--pageserver-config-override=${value}` parameter values when neon_local cli is invoked `RUST_LOG`: logging configuration to pass into Neon CLI +Useful parameters and commands: + +`--pageserver-config-override=${value}` `-c` values to pass into pageserver through neon_local cli + +`--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk +after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. + Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` (Note many tests capture subprocess outputs separately, so this may not diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e3f8247274..818853a4ac 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -30,10 +30,17 @@ import psycopg2 import pytest import requests from _pytest.config import Config +from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from fixtures.log_helper import log from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import Fn, allure_attach_from_dir, get_self_dir, subprocess_capture +from fixtures.utils import ( + ATTACHMENT_NAME_REGEX, + Fn, + allure_attach_from_dir, + get_self_dir, + subprocess_capture, +) # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -590,6 +597,7 @@ class NeonEnvBuilder: auth_enabled: bool = False, rust_log_override: Optional[str] = None, default_branch_name: str = DEFAULT_BRANCH_NAME, + preserve_database_files: bool = False, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -611,6 +619,7 @@ class NeonEnvBuilder: self.neon_binpath = neon_binpath self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version + self.preserve_database_files = preserve_database_files def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -718,6 +727,28 @@ class NeonEnvBuilder: prefix_in_bucket=self.remote_storage_prefix, ) + def cleanup_local_storage(self): + if self.preserve_database_files: + return + + directories_to_clean: List[Path] = [] + for test_entry in Path(self.repo_dir).glob("**/*"): + if test_entry.is_file(): + test_file = test_entry + if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name): + continue + if 
SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name): + continue + log.debug(f"Removing large database {test_file} file") + test_file.unlink() + elif test_entry.is_dir(): + directories_to_clean.append(test_entry) + + for directory_to_clean in reversed(directories_to_clean): + if not os.listdir(directory_to_clean): + log.debug(f"Removing empty directory {directory_to_clean}") + directory_to_clean.rmdir() + def cleanup_remote_storage(self): # here wee check for true remote storage, no the local one # local cleanup is not needed after test because in ci all env will be destroyed anyway @@ -783,7 +814,22 @@ class NeonEnvBuilder: sk.stop(immediate=True) self.env.pageserver.stop(immediate=True) - self.cleanup_remote_storage() + cleanup_error = None + try: + self.cleanup_remote_storage() + except Exception as e: + log.error(f"Error during remote storage cleanup: {e}") + cleanup_error = e + + try: + self.cleanup_local_storage() + except Exception as e: + log.error(f"Error during local storage cleanup: {e}") + if cleanup_error is not None: + cleanup_error = e + + if cleanup_error is not None: + raise cleanup_error self.env.pageserver.assert_no_errors() @@ -949,6 +995,7 @@ class NeonEnv: @pytest.fixture(scope=shareable_scope) def _shared_simple_env( request: FixtureRequest, + pytestconfig: Config, port_distributor: PortDistributor, mock_s3_server: MockS3Server, default_broker: NeonBroker, @@ -980,6 +1027,7 @@ def _shared_simple_env( pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, run_id=run_id, + preserve_database_files=pytestconfig.getoption("--preserve-database-files"), ) as builder: env = builder.init_start() @@ -1006,6 +1054,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: @pytest.fixture(scope="function") def neon_env_builder( + pytestconfig: Config, test_output_dir: str, port_distributor: PortDistributor, mock_s3_server: MockS3Server, @@ -1041,6 +1090,7 @@ def neon_env_builder( pg_version=pg_version, broker=default_broker, run_id=run_id, + preserve_database_files=pytestconfig.getoption("--preserve-database-files"), ) as builder: yield builder @@ -2735,6 +2785,20 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir +def pytest_addoption(parser: Parser): + parser.addoption( + "--preserve-database-files", + action="store_true", + default=False, + help="Preserve timeline files after the test suite is over", + ) + + +SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] + r"config|metadata|.+\.(?:toml|pid|json|sql)" +) + + # This is autouse, so the test output directory always gets created, even # if a test doesn't put anything there. 
It also solves a problem with the # neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 725612853a..a32ce87c33 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -16,6 +16,7 @@ Some handy pytest flags for local development: - `-s` shows test output - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) +- `--cleanup-test-ouput` cleans up after each test # What performance tests do we have and how we run them diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e2822427e9..332e2f2519 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -47,6 +47,7 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o neon_env_builder.pg_version = "14" neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.preserve_database_files = True env = neon_env_builder.init_start() pg = env.postgres.create_start("main") @@ -331,6 +332,7 @@ def check_neon_works( config.initial_tenant = snapshot_config["default_tenant_id"] config.neon_binpath = neon_binpath config.pg_distrib_dir = pg_distrib_dir + config.preserve_database_files = True cli = NeonCli(config) cli.raw_cli(["start"]) From 62f6e969e78b76fa434ccdc194d939f461e3b40d Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 14 Dec 2022 16:33:51 +0100 Subject: [PATCH 076/167] Fix helm value for proxy --- .github/helm-values/neon-stress.proxy-scram.yaml | 4 +--- .github/helm-values/production.proxy-scram.yaml | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml index bbdd13781c..dea47304a0 100644 --- a/.github/helm-values/neon-stress.proxy-scram.yaml +++ b/.github/helm-values/neon-stress.proxy-scram.yaml @@ -4,6 +4,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-stress-console.local/management/api/v2" domain: "*.stress.neon.tech" + sentryEnvironment: "development" podLabels: zenith_service: proxy-scram @@ -24,6 +25,3 @@ metrics: enabled: true selector: release: kube-prometheus-stack - -settings: - sentryEnvironment: "development" diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml index 7011a9ce01..399bc6d21b 100644 --- a/.github/helm-values/production.proxy-scram.yaml +++ b/.github/helm-values/production.proxy-scram.yaml @@ -2,6 +2,7 @@ settings: authBackend: "console" authEndpoint: "http://console-release.local/management/api/v2" domain: "*.cloud.neon.tech" + sentryEnvironment: "production" podLabels: zenith_service: proxy-scram @@ -22,6 +23,3 @@ metrics: enabled: true selector: release: kube-prometheus-stack - -settings: - sentryEnvironment: "production" From df09d0375b153d68ccdef538327ce34c9894ce89 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 14 Dec 2022 17:24:53 +0200 Subject: [PATCH 077/167] ignore metadata_backup files in index_part --- pageserver/src/storage_sync2/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index ce9a43ed3b..82487339ee 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -135,7 +135,7 @@ impl<'de> serde::de::Visitor<'de> 
for UncleanLayerFileNameVisitor { match maybe_clean { Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)), Err(e) => { - if v.ends_with(".old") { + if v.ends_with(".old") || v == "metadata_backup" { Ok(UncleanLayerFileName::BackupFile(v.to_owned())) } else { Err(E::custom(e)) From 8fcba150db3cdd0de74837896bf76c0317b046d5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 14 Dec 2022 18:05:05 +0000 Subject: [PATCH 078/167] test_seqscans: temporarily disable remote test (#3101) Temporarily disable `test_seqscans` for remote projects; they take too much space and time. We can try to re-enable it after switching to per-test projects. --- test_runner/performance/test_seqscans.py | 11 ++++++----- test_runner/regress/test_old_request_lsn.py | 2 +- test_runner/regress/test_pageserver_restart.py | 4 ++-- .../regress/test_walredo_not_left_behind_on_detach.py | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index a61d64553d..bd84724405 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -22,15 +22,16 @@ from pytest_lazyfixture import lazy_fixture # type: ignore ], ) @pytest.mark.parametrize( - "env, scale", + "env,scale", [ # Run on all envs. Use 200x larger table on remote cluster to make sure # it doesn't fit in shared buffers, which are larger on remote than local. pytest.param(lazy_fixture("neon_compare"), 1, id="neon"), pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"), - pytest.param( - lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster - ), + # Reenable after switching to per-test projects created via API + # pytest.param( + # lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster + # ), ], ) def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: int): @@ -45,7 +46,7 @@ def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: in # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_size from pg_settings where name = 'shared_buffers' """ ) diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 3e387bb6cc..1e81d8ba60 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -45,7 +45,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # will cause GetPage requests.
cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index e48815906b..6388e979e5 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -32,7 +32,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) @@ -115,7 +115,7 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index aaaa8893a5..24045e2eb7 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -65,7 +65,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) From 4132ae9dfeefbdcca27f79a04779db8f7f6d164f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 13 Dec 2022 12:23:47 -0500 Subject: [PATCH 079/167] always remove RemoteTimelineClient's metrics when dropping it --- pageserver/src/metrics.rs | 108 ++++++++++++++++++--- pageserver/src/storage_sync2.rs | 25 +++-- test_runner/fixtures/metrics.py | 8 ++ test_runner/regress/test_remote_storage.py | 7 +- test_runner/regress/test_tenants.py | 26 ++++- 5 files changed, 144 insertions(+), 30 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 454ff01f0e..2f1a98e4c5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -201,7 +201,7 @@ pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { // remote storage metrics -pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy = Lazy::new(|| { +static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_remote_upload_queue_unfinished_tasks", "Number of tasks in the upload queue that are not finished yet.", @@ -210,14 +210,14 @@ pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy = Lazy::new(| .expect("failed to define a metric") }); -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, Download, Delete, } impl RemoteOpKind { - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'static str { match self { Self::Upload => "upload", Self::Download => "download", @@ -226,13 +226,13 
@@ impl RemoteOpKind { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] pub enum RemoteOpFileKind { Layer, Index, } impl RemoteOpFileKind { - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'static str { match self { Self::Layer => "layer", Self::Index => "index", @@ -491,10 +491,94 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) { use futures::Future; use pin_project_lite::pin_project; +use std::collections::HashMap; use std::pin::Pin; +use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::Instant; +pub struct RemoteTimelineClientMetrics { + tenant_id: String, + timeline_id: String, + remote_operation_time: Mutex>, + unfinished_tasks: Mutex>, +} + +impl RemoteTimelineClientMetrics { + pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + RemoteTimelineClientMetrics { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + remote_operation_time: Mutex::new(HashMap::default()), + unfinished_tasks: Mutex::new(HashMap::default()), + } + } + pub fn remote_operation_time( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + status: &'static str, + ) -> Histogram { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.remote_operation_time.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str(), status); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_OPERATION_TIME + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + key.2, + ]) + .unwrap() + }); + metric.clone() + } + pub fn unfinished_tasks( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntGauge { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.unfinished_tasks.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } +} + +impl Drop for RemoteTimelineClientMetrics { + fn drop(&mut self) { + let RemoteTimelineClientMetrics { + tenant_id, + timeline_id, + remote_operation_time, + unfinished_tasks, + } = self; + for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { + let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); + } + for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() { + let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } + } +} + /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. pub trait MeasureRemoteOp: Sized { @@ -504,6 +588,7 @@ pub trait MeasureRemoteOp: Sized { timeline_id: TimelineId, file_kind: RemoteOpFileKind, op: RemoteOpKind, + metrics: Arc, ) -> MeasuredRemoteOp { let start = Instant::now(); MeasuredRemoteOp { @@ -513,6 +598,7 @@ pub trait MeasureRemoteOp: Sized { file_kind, op, start, + metrics, } } } @@ -529,6 +615,7 @@ pin_project! 
{ file_kind: RemoteOpFileKind, op: RemoteOpKind, start: Instant, + metrics: Arc, } } @@ -541,15 +628,8 @@ impl>, O, E> Future for MeasuredRemoteOp { if let Poll::Ready(ref res) = poll_result { let duration = this.start.elapsed(); let status = if res.is_ok() { &"success" } else { &"failure" }; - REMOTE_OPERATION_TIME - .get_metric_with_label_values(&[ - &this.tenant_id.to_string(), - &this.timeline_id.to_string(), - this.file_kind.as_str(), - this.op.as_str(), - status, - ]) - .unwrap() + this.metrics + .remote_operation_time(this.file_kind, this.op, status) .observe(duration.as_secs_f64()); } poll_result diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 7cc0eac2bf..cebec4d615 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -210,10 +210,9 @@ use utils::lsn::Lsn; use self::index::IndexPart; -use crate::metrics::MeasureRemoteOp; use crate::metrics::RemoteOpFileKind; use crate::metrics::RemoteOpKind; -use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS; +use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics}; use crate::tenant::filename::LayerFileName; use crate::{ config::PageServerConf, @@ -256,6 +255,8 @@ pub struct RemoteTimelineClient { upload_queue: Mutex, + metrics: Arc, + storage_impl: GenericRemoteStorage, } @@ -501,6 +502,7 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Download, + Arc::clone(&self.metrics), ) .await } @@ -528,6 +530,7 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Download, + Arc::clone(&self.metrics), ) .await?; @@ -847,6 +850,7 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Upload, + Arc::clone(&self.metrics), ) .await } @@ -863,6 +867,7 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Upload, + Arc::clone(&self.metrics), ) .await } @@ -877,6 +882,7 @@ impl RemoteTimelineClient { self.timeline_id, *metric_file_kind, RemoteOpKind::Delete, + Arc::clone(&self.metrics), ) .await } @@ -977,14 +983,8 @@ impl RemoteTimelineClient { return; } }; - REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS - .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), - file_kind.as_str(), - op_kind.as_str(), - ]) - .unwrap() + self.metrics + .unfinished_tasks(&file_kind, &op_kind) .add(delta) } @@ -1068,6 +1068,7 @@ pub fn create_remote_timeline_client( timeline_id, storage_impl: remote_storage, upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), }) } @@ -1180,6 +1181,10 @@ mod tests { timeline_id: TIMELINE_ID, storage_impl, upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &harness.tenant_id, + &TIMELINE_ID, + )), }); let remote_timeline_dir = diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 86ab4425ed..17b2b71df2 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -39,6 +39,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics: return metrics +PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( + "pageserver_remote_upload_queue_unfinished_tasks", + "pageserver_remote_operation_seconds_bucket", + "pageserver_remote_operation_seconds_count", + "pageserver_remote_operation_seconds_sum", +) + PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_current_logical_size", "pageserver_current_physical_size", @@ -62,4 +69,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", + *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 7152bc8b6a..d8f8298fa6 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -384,7 +384,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( metrics, re.MULTILINE, ) - assert matches + if matches is None: + return None return int(matches[1]) pg = env.postgres.create_start("main", tenant_id=tenant_id) @@ -436,8 +437,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert not timeline_path.exists() - # timeline deletion should kill ongoing uploads - assert get_queued_count(file_kind="index", op_kind="upload") == 0 + # timeline deletion should kill ongoing uploads, so, the metric will be gone + assert get_queued_count(file_kind="index", op_kind="upload") is None # timeline deletion should be unblocking checkpoint ops checkpoint_thread.join(2.0) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0b20afefc3..9477ae3c25 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -7,7 +7,11 @@ from typing import List import pytest from fixtures.log_helper import log -from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics +from fixtures.metrics import ( + PAGESERVER_PER_TENANT_METRICS, + PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, + parse_metrics, +) from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): ) -def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "remote_storage_kind", + # exercise both the code paths where remote_storage=None and remote_storage=Some(...) 
+ [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3], +) +def test_pageserver_metrics_removed_after_detach( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_pageserver_metrics_removed_after_detach", + ) + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde for tenant in [tenant_1, tenant_2]: pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) - assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS) + expected = set(PAGESERVER_PER_TENANT_METRICS) + if remote_storage_kind == RemoteStorageKind.NOOP: + # if there's no remote storage configured, we don't expose the remote timeline client metrics + expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS) + assert pre_detach_samples == expected env.pageserver.http_client().tenant_detach(tenant) From c04c201520c30f56700626365788bf2e28395040 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 14 Dec 2022 21:28:14 +0100 Subject: [PATCH 080/167] Push proxy metrics to Victoria Metrics (#3106) --- .../dev-eu-west-1-zeta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-link.yaml | 25 +++++++++++++++++++ ...s-east-2-beta.neon-proxy-scram-legacy.yaml | 25 +++++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../helm-values/neon-stress.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/neon-stress.proxy.yaml | 25 +++++++++++++++++++ ...-southeast-1-epsilon.neon-proxy-scram.yaml | 25 +++++++++++++++++++ ...d-eu-central-1-gamma.neon-proxy-scram.yaml | 25 +++++++++++++++++++ ...prod-us-east-2-delta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../prod-us-west-2-eta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../helm-values/production.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/production.proxy.yaml | 25 +++++++++++++++++++ .github/helm-values/staging.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/staging.proxy.yaml | 25 +++++++++++++++++++ 14 files changed, 350 insertions(+) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index f89eea5972..ae9c1f2e40 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index eeb025277b..093fac146a 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -38,3 +38,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index ed710bc196..a2f932e4fb 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index ba0109c1eb..1138536e94 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml index dea47304a0..ed580349fc 100644 --- a/.github/helm-values/neon-stress.proxy-scram.yaml +++ b/.github/helm-values/neon-stress.proxy-scram.yaml @@ -25,3 +25,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml index c3ecf6c743..94270ced09 100644 --- a/.github/helm-values/neon-stress.proxy.yaml +++ b/.github/helm-values/neon-stress.proxy.yaml @@ -34,3 +34,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index a37a37406c..4e4aff1f9e 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 69d00a7e9c..94290a87e1 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 19d91fa4dc..1a4023708b 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index f148188c48..2942d6a2aa 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml index 399bc6d21b..c7143cd61a 100644 --- a/.github/helm-values/production.proxy-scram.yaml +++ b/.github/helm-values/production.proxy-scram.yaml @@ -23,3 +23,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml index 9db68c1044..dbaf3cd096 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/production.proxy.yaml @@ -32,3 +32,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml index f249df3612..66f9921c9a 100644 --- a/.github/helm-values/staging.proxy-scram.yaml +++ b/.github/helm-values/staging.proxy-scram.yaml @@ -30,3 +30,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml index 62b4c4a595..a22082e625 100644 --- a/.github/helm-values/staging.proxy.yaml +++ b/.github/helm-values/staging.proxy.yaml @@ -30,3 +30,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" From bf3ac2be2d5cdff317ddd105dd68d13523382b19 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Dec 2022 08:53:54 -0500 Subject: [PATCH 081/167] add remote_physical_size metric We do the accounting exclusively after updating remote IndexPart successfully. This is cleaner & more robust than doing it upon completion of individual layer file uploads / deletions since we can uset .set() insteaf of add()/sub(). NB: Originally, this work was intended to be part of #3013 but it turns out that it's completely orthogonal. So, spin it out into this PR for easier review. Since this change is additive, it won't break anything. --- pageserver/src/metrics.rs | 30 +++++++++++++++++++++++++ pageserver/src/storage_sync2.rs | 32 +++++++++++++++++++++++++-- pageserver/src/storage_sync2/index.rs | 2 +- test_runner/fixtures/metrics.py | 1 + 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2f1a98e4c5..308f9cd4eb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -96,6 +96,16 @@ static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_remote_physical_size", + "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.", + // Corollary: If any files are missing from the index part, they won't be included here. 
+ &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_current_logical_size", @@ -500,6 +510,7 @@ use std::time::Instant; pub struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, + remote_physical_size_gauge: Mutex>, remote_operation_time: Mutex>, unfinished_tasks: Mutex>, } @@ -511,8 +522,22 @@ impl RemoteTimelineClientMetrics { timeline_id: timeline_id.to_string(), remote_operation_time: Mutex::new(HashMap::default()), unfinished_tasks: Mutex::new(HashMap::default()), + remote_physical_size_gauge: Mutex::new(None), } } + pub fn remote_physical_size_gauge(&self) -> UIntGauge { + let mut guard = self.remote_physical_size_gauge.lock().unwrap(); + guard + .get_or_insert_with(|| { + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + ]) + .unwrap() + }) + .clone() + } pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, @@ -562,6 +587,7 @@ impl Drop for RemoteTimelineClientMetrics { let RemoteTimelineClientMetrics { tenant_id, timeline_id, + remote_physical_size_gauge, remote_operation_time, unfinished_tasks, } = self; @@ -576,6 +602,10 @@ impl Drop for RemoteTimelineClientMetrics { b, ]); } + { + let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above + let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + } } } diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index cebec4d615..89bbc34227 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -460,6 +460,7 @@ impl RemoteTimelineClient { pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part)?; + self.update_remote_physical_size_gauge(Some(index_part)); Ok(()) } @@ -471,6 +472,7 @@ impl RemoteTimelineClient { ) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_empty_remote(local_metadata)?; + self.update_remote_physical_size_gauge(None); Ok(()) } @@ -482,6 +484,20 @@ impl RemoteTimelineClient { } } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { + let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { + current_remote_index_part + .layer_metadata + .iter() + // If we don't have the file size for the layer, don't account for it in the metric. + .map(|(_, ilmd)| ilmd.file_size.unwrap_or(0)) + .sum() + } else { + 0 + }; + self.metrics.remote_physical_size_gauge().set(size); + } + // // Download operations. // @@ -543,6 +559,14 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { upgraded.merge(&new_metadata); + // If we don't do an index file upload inbetween here and restart, + // the value will go back down after pageserver restart, since we will + // have lost this data point. + // But, we upload index part fairly frequently, and restart pageserver rarely. + // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner. + self.metrics + .remote_physical_size_gauge() + .add(downloaded_size); } else { // The file should exist, since we just downloaded it. 
warn!( @@ -855,7 +879,7 @@ impl RemoteTimelineClient { .await } UploadOp::UploadMetadata(ref index_part, _lsn) => { - upload::upload_index_part( + let res = upload::upload_index_part( self.conf, &self.storage_impl, self.tenant_id, @@ -869,7 +893,11 @@ impl RemoteTimelineClient { RemoteOpKind::Upload, Arc::clone(&self.metrics), ) - .await + .await; + if res.is_ok() { + self.update_remote_physical_size_gauge(Some(index_part)); + } + res } UploadOp::Delete(metric_file_kind, ref layer_file_name) => { let path = &self diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index 82487339ee..ed4ed10189 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -232,7 +232,7 @@ impl IndexPart { /// Serialized form of [`LayerFileMetadata`]. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] pub struct IndexLayerMetadata { - file_size: Option, + pub(super) file_size: Option, } impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 17b2b71df2..5fe6c43528 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -44,6 +44,7 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_operation_seconds_bucket", "pageserver_remote_operation_seconds_count", "pageserver_remote_operation_seconds_sum", + "pageserver_remote_physical_size", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( From 10cd64cf8dd8b4280882ac3ba0d89182ac1b3b14 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Dec 2022 19:26:06 +0100 Subject: [PATCH 082/167] make TaskHandle::next_task_event cancellation-safe If we get cancelled before jh.await returns we've take()n the join handle but drop the result on the floor. Fix it by setting self.join_handle = None after the .await fixes https://github.com/neondatabase/neon/issues/3104 --- pageserver/src/walreceiver.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index e627e9ecd0..74ede7c213 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -126,15 +126,21 @@ impl TaskHandle { match self.events_receiver.changed().await { Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), Err(_task_channel_part_dropped) => { - TaskEvent::End(match self.join_handle.take() { + TaskEvent::End(match self.join_handle.as_mut() { Some(jh) => { if !jh.is_finished() { warn!("sender is dropped while join handle is still alive"); } - jh.await + let res = jh + .await .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) - .and_then(|x| x) + .and_then(|x| x); + + // For cancellation-safety, drop join_handle only after successful .await. 
+ self.join_handle = None; + + res } None => { // Another option is to have an enum, join handle or result and give away the reference to it From 397b60feabd132cfe4401e8b4f2c1cf11c25a71c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 6 Dec 2022 10:27:26 -0500 Subject: [PATCH 083/167] common abstraction for waiting for SK commit_lsn to reach PS --- test_runner/fixtures/neon_fixtures.py | 31 +++++++++++++++++++ .../test_tenants_with_remote_storage.py | 14 +++------ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 818853a4ac..3a3ee94425 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3084,3 +3084,34 @@ def fork_at_current_lsn( """ current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0] return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) + + +def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: List[Safekeeper], + pageserver: NeonPageserver, +): + sk_commit_lsns = [ + sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers + ] + lsn = max(sk_commit_lsns) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn) + return lsn + + +def wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: List[Safekeeper], + pageserver: NeonPageserver, +): + lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( + tenant_id, timeline_id, safekeepers, pageserver + ) + ps_http = pageserver.http_client() + # force a checkpoint to trigger upload + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, lsn) + return lsn diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index afc413f3e3..57aaa70559 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -24,6 +24,7 @@ from fixtures.neon_fixtures import ( assert_no_in_progress_downloads_for_tenant, available_remote_storages, wait_for_last_record_lsn, + wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -161,16 +162,9 @@ def test_tenants_attached_after_download( ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 env.postgres.stop_all() - sk_commit_lsns = [ - sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn - for sk in env.safekeepers - ] - log.info("wait for pageserver to process all the WAL") - wait_for_last_record_lsn(client, tenant_id, timeline_id, max(sk_commit_lsns)) - log.info("wait for it to reach remote storage") - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(client, tenant_id, timeline_id, max(sk_commit_lsns)) - log.info("latest safekeeper_commit_lsn reached remote storage") + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) detail_before = client.timeline_detail( tenant_id, timeline_id, include_non_incremental_physical_size=True From 807b110946ee603aa64b363b6041f75edd822f97 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 15 Dec 2022 18:06:17 +0100 Subject: [PATCH 084/167] Update Makefile configuration: 
(#3011) - Use only one templated section for most postgres-versioned steps - Clean up neon_walredo, too, when running neon-pg-ext-clean - Depend on the various cleanup steps for `clean` instead of manually executing those cleanup steps. --- Makefile | 199 +++++++++++++++++++++++-------------------------------- 1 file changed, 84 insertions(+), 115 deletions(-) diff --git a/Makefile b/Makefile index 4711dc1c7d..92a4532684 100644 --- a/Makefile +++ b/Makefile @@ -61,146 +61,115 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. .PHONY: neon -neon: postgres-v14-headers postgres-v15-headers +neon: postgres-headers +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts -# The rules are duplicated for Postgres v14 and 15. We may want to refactor +# Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. # -$(POSTGRES_INSTALL_DIR)/build/v14/config.status: - +@echo "Configuring Postgres v14 build" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14 - (cd $(POSTGRES_INSTALL_DIR)/build/v14 && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \ +$(POSTGRES_INSTALL_DIR)/build/%/config.status: + +@echo "Configuring Postgres $* build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* + (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ CFLAGS='$(PG_CFLAGS)' \ $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log) - -$(POSTGRES_INSTALL_DIR)/build/v15/config.status: - +@echo "Configuring Postgres v15 build" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15 - (cd $(POSTGRES_INSTALL_DIR)/build/v15 && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \ - CFLAGS='$(PG_CFLAGS)' \ - $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log) + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log) # nicer alias to run 'configure' -.PHONY: postgres-v14-configure -postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status - -.PHONY: postgres-v15-configure -postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status +# Note: I've been unable to use templates for this part of our configuration. +# I'm not sure why it wouldn't work, but this is the only place (apart from +# the "build-all-versions" entry points) where direct mention of PostgreSQL +# versions is used. 
+.PHONY: postgres-configure-v15 +postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status +.PHONY: postgres-configure-v14 +postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include -.PHONY: postgres-v14-headers -postgres-v14-headers: postgres-v14-configure - +@echo "Installing PostgreSQL v14 headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install - -.PHONY: postgres-v15-headers -postgres-v15-headers: postgres-v15-configure - +@echo "Installing PostgreSQL v15 headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install +.PHONY: postgres-headers-% +postgres-headers-%: postgres-configure-% + +@echo "Installing PostgreSQL $* headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install # Compile and install PostgreSQL -.PHONY: postgres-v14 -postgres-v14: postgres-v14-configure \ - postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers` - +@echo "Compiling PostgreSQL v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install - +@echo "Compiling libpq v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install - +@echo "Compiling pg_prewarm v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install - +@echo "Compiling pg_buffercache v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install - +@echo "Compiling pageinspect v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install +.PHONY: postgres-% +postgres-%: postgres-configure-% \ + postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install + +@echo "Compiling libpq $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install + +@echo "Compiling pg_prewarm $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install + +@echo "Compiling pg_buffercache $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + +@echo "Compiling pageinspect $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install -.PHONY: postgres-v15 -postgres-v15: postgres-v15-configure \ - postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers` - +@echo "Compiling PostgreSQL v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install - +@echo "Compiling libpq v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install - +@echo "Compiling pg_prewarm v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install - +@echo "Compiling pg_buffercache v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install - +@echo "Compiling pageinspect v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install +.PHONY: postgres-clean-% +postgres-clean-%: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean -# shorthand to build all Postgres versions -postgres: postgres-v14 postgres-v15 +.PHONY: neon-pg-ext-% +neon-pg-ext-%: postgres-% + +@echo "Compiling neon $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$* + $(MAKE) 
PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install + +@echo "Compiling neon_walredo $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install + +@echo "Compiling neon_test_utils $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install -.PHONY: postgres-v14-clean -postgres-v14-clean: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean +.PHONY: neon-pg-ext-clean-% +neon-pg-ext-clean-%: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean -.PHONY: postgres-v15-clean -postgres-v15-clean: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean - -neon-pg-ext-v14: postgres-v14 - +@echo "Compiling neon v14" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) - +@echo "Compiling neon_walredo v14" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) - +@echo "Compiling neon_test_utils" v14 - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) - -neon-pg-ext-v15: postgres-v15 - +@echo "Compiling neon v15" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) - +@echo "Compiling neon_walredo v15" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) - +@echo "Compiling neon_test_utils" v15 - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 - (cd 
$(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) +.PHONY: neon-pg-ext +neon-pg-ext: \ + neon-pg-ext-v14 \ + neon-pg-ext-v15 .PHONY: neon-pg-ext-clean - $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean - $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean +neon-pg-ext-clean: \ + neon-pg-ext-clean-v14 \ + neon-pg-ext-clean-v15 -neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15 -postgres-headers: postgres-v14-headers postgres-v15-headers -postgres-clean: postgres-v14-clean postgres-v15-clean +# shorthand to build all Postgres versions +.PHONY: postgres +postgres: \ + postgres-v14 \ + postgres-v15 + +.PHONY: postgres-headers +postgres-headers: \ + postgres-headers-v14 \ + postgres-headers-v15 + +.PHONY: postgres-clean +postgres-clean: \ + postgres-clean-v14 \ + postgres-clean-v15 # This doesn't remove the effects of 'configure'. .PHONY: clean -clean: - cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean - cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean +clean: postgres-clean neon-pg-ext-clean $(CARGO_CMD_PREFIX) cargo clean - cd pgxn/neon && $(MAKE) clean - cd pgxn/neon_test_utils && $(MAKE) clean # This removes everything .PHONY: distclean From b58f7710ff4d3c8772dee43e98a7166678706177 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 14 Dec 2022 23:05:19 +0100 Subject: [PATCH 085/167] seqwait: different error messages per variant Would have been handy to get slightly more details in https://github.com/neondatabase/neon/issues/3109 refs https://github.com/neondatabase/neon/issues/3109 --- libs/utils/src/seqwait.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index bf330a482c..e3f0b505da 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -11,11 +11,13 @@ use tokio::time::timeout; /// An error happened while waiting for a number #[derive(Debug, PartialEq, Eq, thiserror::Error)] -#[error("SeqWaitError")] pub enum SeqWaitError { /// The wait timeout was reached + #[error("seqwait timeout was reached")] Timeout, + /// [`SeqWait::shutdown`] was called + #[error("SeqWait::shutdown was called")] Shutdown, } From 70ce01d84d155b2622d4b0857d20abdcbe7a5b87 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 16 Dec 2022 00:42:30 +0300 Subject: [PATCH 086/167] Deploy broker with L4 LB in new env. (#3125) Seems to be fixing issue with missing keepalives. 
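With the nginx gRPC ingress out of the picture, HTTP/2 keepalive pings from
broker clients should now travel end-to-end to the broker instead of being
terminated at the L7 proxy. As a rough sketch only (the client code is not part
of this patch; `connect_broker` and the interval/timeout values are invented,
and the endpoint string matches the new staging value below), a tonic-based
client could enable keepalives against the plaintext L4 endpoint like this:

    use std::time::Duration;
    use tonic::transport::{Channel, Endpoint};

    async fn connect_broker() -> Result<Channel, tonic::transport::Error> {
        Endpoint::from_static(
            "http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051",
        )
        // pings are answered by the broker itself, not by an intermediate proxy
        .http2_keep_alive_interval(Duration::from_secs(10))
        .keep_alive_timeout(Duration::from_secs(5))
        .keep_alive_while_idle(true)
        .connect()
        .await
    }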
--- .../ansible/prod.ap-southeast-1.hosts.yaml | 2 +- .github/ansible/prod.eu-central-1.hosts.yaml | 2 +- .github/ansible/prod.us-east-2.hosts.yaml | 2 +- .github/ansible/prod.us-west-2.hosts.yaml | 2 +- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- ...ev-eu-west-1-zeta.neon-storage-broker.yaml | 33 ++++++++----------- ...ev-us-east-2-beta.neon-storage-broker.yaml | 33 ++++++++----------- ...utheast-1-epsilon.neon-storage-broker.yaml | 33 ++++++++----------- ...u-central-1-gamma.neon-storage-broker.yaml | 33 ++++++++----------- ...d-us-east-2-delta.neon-storage-broker.yaml | 33 ++++++++----------- ...rod-us-west-2-eta.neon-storage-broker.yaml | 33 ++++++++----------- .github/workflows/build_and_test.yml | 4 +-- 13 files changed, 92 insertions(+), 122 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index bcc7bb3b16..648029c120 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 2b372d0fcb..c285a9f3b6 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 7a4002ec88..1753068b8c 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 682ee5994d..7d6e49bf9c 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-west-2 bucket_region: us-west-2 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.eta.us-west-2.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 90f00175b0..cfcc3a9ae8 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ 
b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-dev-storage-eu-west-1 bucket_region: eu-west-1 console_mgmt_base_url: http://console-staging.local - broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443 + broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index d2b7fae12a..78a4582e57 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-staging.local - broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443 + broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index e876367a18..c6e682f571 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.zeta.eu-west-1.internal.aws.neon.build - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.zeta.eu-west-1.internal.aws.neon.build - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index dcf4b99de2..c7682d24c0 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - 
nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.beta.us-east-2.internal.aws.neon.build - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.beta.us-east-2.internal.aws.neon.build - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml index 0abc6ebaa1..92b1777d0b 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml index d44a3eab5c..f89df4533a 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - 
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.gamma.eu-central-1.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.gamma.eu-central-1.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml index b9eeff5681..8cbc1af7cf 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.delta.us-east-2.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.delta.us-east-2.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml index 249f76303a..8a7488948d 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: 
storage-broker.eta.us-west-2.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.eta.us-west-2.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7a887cbece..43b855a2b0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1072,7 +1072,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s deploy-proxy-prod-new: runs-on: prod @@ -1149,7 +1149,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] From 6dec85b19d63fd18b7f64d65d131fae6842f39f0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 15 Dec 2022 12:39:46 +0200 Subject: [PATCH 087/167] Redefine the timeline_gc API to not perform a forced compaction Previously, the /v1/tenant/:tenant_id/timeline/:timeline_id/do_gc API call performed a flush and compaction on the timeline before GC. Change it not to do that, and change all the tests that used that API to perform compaction explicitly. The compaction happens at a slightly different point now. Previously, the code performed the `refresh_gc_info_internal` step first, and only then did compaction on all the timelines. I don't think that was what was originally intended here. Presumably the idea with compaction was to make some old layer files available for GC. But if we're going to flush the current in-memory layer to disk, surely you would want to include the newly-written layer in the compaction too. 
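For callers that want the old behavior, the flush now has to be issued
explicitly before GC. A minimal sketch, reusing only names that already exist
in this code (the helper name `flush_then_gc` is made up; this is not code
added by the patch, and the Python tests below do the same thing through the
pageserver HTTP client):

    async fn flush_then_gc(
        tenant: &Tenant,
        timeline: &Timeline,
        timeline_id: TimelineId,
        gc_horizon: u64,
        pitr: Duration,
    ) -> anyhow::Result<GcResult> {
        // Flush in-memory layers first so they too become GC candidates;
        // this is what `checkpoint_before_gc = true` used to trigger internally.
        timeline.checkpoint(CheckpointConfig::Forced).await?;
        tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr).await
    }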
I guess this didn't make any difference to the tests in practice, but in any case, the tests now perform the flush and compaction before any of the GC steps. Some of the tests might not need the compaction at all, but I didn't try hard to determine which ones might need it. I left it out from a few tests that intentionally tested calling do_gc with an invalid tenant or timeline ID, though. --- pageserver/src/tenant.rs | 24 +++++---------------- pageserver/src/tenant_mgr.rs | 2 +- pageserver/src/tenant_tasks.rs | 2 +- test_runner/fixtures/compare_fixtures.py | 1 + test_runner/regress/test_branch_and_gc.py | 2 ++ test_runner/regress/test_branch_behind.py | 1 + test_runner/regress/test_gc_aggressive.py | 9 ++++---- test_runner/regress/test_import.py | 1 + test_runner/regress/test_old_request_lsn.py | 1 + test_runner/regress/test_pitr_gc.py | 1 + test_runner/regress/test_tenant_detach.py | 1 + test_runner/regress/test_timeline_size.py | 1 - 12 files changed, 20 insertions(+), 26 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4fcb1e3ba3..0e59b43dda 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1164,7 +1164,6 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, - checkpoint_before_gc: bool, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), @@ -1179,7 +1178,7 @@ impl Tenant { let _timer = STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .start_timer(); - self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) + self.gc_iteration_internal(target_timeline_id, horizon, pitr) .await } } @@ -1778,7 +1777,6 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, - checkpoint_before_gc: bool, ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -1805,18 +1803,6 @@ impl Tenant { // made. break; } - - // If requested, force flush all in-memory layers to disk first, - // so that they too can be garbage collected. That's - // used in tests, so we want as deterministic results as possible. - if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced).await?; - info!( - "timeline {} checkpoint_before_gc done", - timeline.timeline_id - ); - } - let result = timeline.gc().await?; totals += result; } @@ -2877,7 +2863,7 @@ mod tests { // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. 
tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; // try to branch at lsn 25, should fail because we already garbage collected the data @@ -2933,7 +2919,7 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?; let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); match tline.get(*TEST_KEY, Lsn(0x25)) { @@ -2960,7 +2946,7 @@ mod tests { .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); @@ -2985,7 +2971,7 @@ mod tests { // run gc on parent tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; // Check that the data is still accessible on the branch. diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index f4f1eba717..615dcce4a1 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -496,7 +496,7 @@ pub async fn immediate_gc( async move { fail::fail_point!("immediate_gc_task_pre"); let result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, true) + .gc_iteration(Some(timeline_id), gc_horizon, pitr) .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index d3aec933c2..d71f244725 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -127,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) { } else { // Run gc if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 291f924379..530e5afaab 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -115,6 +115,7 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): + self.pageserver_http.timeline_checkpoint(self.env.initial_tenant, self.timeline) self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0) def compact(self): diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index dfbf956568..cc807b7ff3 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -84,6 +84,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. 
+ pageserver_http_client.timeline_checkpoint(tenant, timeline_main) pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( @@ -156,6 +157,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)")) + pageserver_http_client.timeline_checkpoint(tenant, b0) def do_gc(): pageserver_http_client.timeline_gc(tenant, b0, 0) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index a841e3ced2..d19f6a7d39 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -109,6 +109,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # check that we cannot create branch based on garbage collected data with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) print_gc_result(gc_result) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 332bef225f..92855899f0 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -35,12 +35,13 @@ async def gc(env: NeonEnv, timeline: TimelineId): loop = asyncio.get_running_loop() + def do_gc(): + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) + pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + with concurrent.futures.ThreadPoolExecutor() as pool: while updates_performed < updates_to_perform: - await loop.run_in_executor( - pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) - ) - + await loop.run_in_executor(pool, do_gc) # At the same time, run UPDATEs and GC async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId): diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 1a99d13a0b..fb1bc4839e 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -306,6 +306,7 @@ def _import( # Check that gc works pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_checkpoint(tenant, timeline) pageserver_http.timeline_gc(tenant, timeline, 0) return tar_output_file diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 1e81d8ba60..9885a811e1 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -59,6 +59,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. 
for i in range(10): + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) print_gc_result(gc_result) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index d8b7256577..fe4fbc0927 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -52,6 +52,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # run GC with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) pageserver_http.timeline_compact(env.initial_tenant, timeline) # perform aggressive GC. Data still should be kept because of the PITR setting. gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 59811c565c..ce1e334bfa 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -24,6 +24,7 @@ def do_gc_target( """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: log.info("sending gc http request") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) pageserver_http.timeline_gc(tenant_id, timeline_id, 0) except Exception as e: log.error("do_gc failed: %s", e) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index cef1f365cd..4b70c2ea18 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -326,7 +326,6 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) assert_physical_size(env, env.initial_tenant, new_timeline_id) From c262390214109d46be2c230f1d52948e7134f76d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 15 Dec 2022 12:39:50 +0200 Subject: [PATCH 088/167] Don't upload index file when GC doesn't remove anything. I saw an excessive number of index file upload operations in production, even when nothing on the timeline changes. It was because our GC schedules index file upload if the GC cutoff LSN is advanced, even if the GC had nothing else to do. The GC cutoff LSN marches steadily forwards, even when there is no user activity on the timeline, when the cutoff is determined by the time-based PITR interval setting. To dial that down, only schedule index file upload when GC is about to actually remove something. --- pageserver/src/tenant/timeline.rs | 51 +++++++------- test_runner/regress/test_gc_aggressive.py | 86 ++++++++++++++++++++++- 2 files changed, 111 insertions(+), 26 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a746fd9bf8..cc6583dcf6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2487,9 +2487,6 @@ impl Timeline { ); write_guard.store_and_unlock(new_gc_cutoff).wait(); } - // Persist the new GC cutoff value in the metadata file, before - // we actually remove anything. - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); @@ -2600,19 +2597,33 @@ impl Timeline { layers_to_remove.push(Arc::clone(&l)); } - // Actually delete the layers from disk and remove them from the map. 
- // (couldn't do this in the loop above, because you cannot modify a collection - // while iterating it. BTreeMap::retain() would be another option) - let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len()); - for doomed_layer in layers_to_remove { - let path = doomed_layer.local_path(); - self.metrics - .current_physical_size_gauge - .sub(path.metadata()?.len()); - layer_names_to_delete.push(doomed_layer.filename()); - doomed_layer.delete()?; - layers.remove_historic(doomed_layer); - result.layers_removed += 1; + if !layers_to_remove.is_empty() { + // Persist the new GC cutoff value in the metadata file, before + // we actually remove anything. + self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + + // Actually delete the layers from disk and remove them from the map. + // (couldn't do this in the loop above, because you cannot modify a collection + // while iterating it. BTreeMap::retain() would be another option) + let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len()); + for doomed_layer in layers_to_remove { + let path = doomed_layer.local_path(); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); + layer_names_to_delete.push(doomed_layer.filename()); + doomed_layer.delete()?; + layers.remove_historic(doomed_layer); + result.layers_removed += 1; + } + + if result.layers_removed != 0 { + fail_point!("after-timeline-gc-removed-layers"); + } + + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; + } } info!( @@ -2620,14 +2631,6 @@ impl Timeline { result.layers_removed, new_gc_cutoff ); - if result.layers_removed != 0 { - fail_point!("after-timeline-gc-removed-layers"); - } - - if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; - } - result.elapsed = now.elapsed()?; Ok(result) } diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 92855899f0..b9d012fa36 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -2,9 +2,17 @@ import asyncio import concurrent.futures import random +import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import TimelineId +from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + Postgres, + RemoteStorageKind, + wait_for_last_flush_lsn, +) +from fixtures.types import TenantId, TimelineId from fixtures.utils import query_scalar # Test configuration @@ -43,6 +51,7 @@ async def gc(env: NeonEnv, timeline: TimelineId): while updates_performed < updates_to_perform: await loop.run_in_executor(pool, do_gc) + # At the same time, run UPDATEs and GC async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId): workers = [] @@ -88,3 +97,76 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): r = cur.fetchone() assert r is not None assert r == (num_rows, updates_to_perform) + + +# +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): + + # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + + neon_env_builder.enable_remote_storage( + 
remote_storage_kind=remote_storage_kind, + test_name="test_gc_index_upload", + ) + + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_gc_index_upload", "main") + pg = env.postgres.create_start("test_gc_index_upload") + + pageserver_http = env.pageserver.http_client() + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id")) + timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Helper function that gets the number of given kind of remote ops from the metrics + def get_num_remote_ops(file_kind: str, op_kind: str) -> int: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + total = 0.0 + for sample in ps_metrics.query_all( + name="pageserver_remote_operation_seconds_count", + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ): + total += sample[2] + return int(total) + + # Sanity check that the metric works + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) + before = get_num_remote_ops("index", "upload") + assert before > 0 + + # Run many cycles of GC. Then check that the number of index files + # uploads didn't grow much. In particular we don't want to re-upload the + # index file on every GC iteration, when it has no work to do. + # + # On each iteration, we use a slightly smaller GC horizon, so that the GC + # at least needs to check if it has work to do. + for i in range(100): + cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')") + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32) + num_index_uploads = get_num_remote_ops("index", "upload") + log.info(f"{num_index_uploads} index uploads after GC iteration {i}") + + after = num_index_uploads + log.info(f"{after-before} new index uploads during test") + assert after - before < 5 From e14bbb889a24c06ea34314c1c864eec2c10c0d4e Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 16 Dec 2022 12:55:12 +0300 Subject: [PATCH 089/167] Enable broker client keepalives. (#3127) Should fix stale connections. ref https://github.com/neondatabase/neon/issues/3108 --- pageserver/src/config.rs | 19 ++++++++++++ pageserver/src/walreceiver.rs | 11 ++++--- safekeeper/src/bin/safekeeper.rs | 4 +++ safekeeper/src/broker.rs | 2 +- safekeeper/src/lib.rs | 2 ++ storage_broker/benches/rps.rs | 6 ++-- storage_broker/src/bin/storage_broker.rs | 38 ++++++++++++++---------- storage_broker/src/lib.rs | 9 +++++- 8 files changed, 66 insertions(+), 25 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 48e9f32276..9971ddc0f7 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -137,6 +137,7 @@ pub struct PageServerConf { /// Storage broker endpoints to connect to. 
pub broker_endpoint: Uri, + pub broker_keepalive_interval: Duration, pub log_format: LogFormat, @@ -215,6 +216,7 @@ struct PageServerConfigBuilder { profiling: BuilderValue, broker_endpoint: BuilderValue, + broker_keepalive_interval: BuilderValue, log_format: BuilderValue, @@ -247,6 +249,10 @@ impl Default for PageServerConfigBuilder { broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint")), + broker_keepalive_interval: Set(humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL, + ) + .expect("cannot parse default keepalive interval")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), @@ -310,6 +316,10 @@ impl PageServerConfigBuilder { self.broker_endpoint = BuilderValue::Set(broker_endpoint) } + pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) { + self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) + } + pub fn id(&mut self, node_id: NodeId) { self.id = BuilderValue::Set(node_id) } @@ -365,6 +375,9 @@ impl PageServerConfigBuilder { broker_endpoint: self .broker_endpoint .ok_or(anyhow!("No broker endpoints provided"))?, + broker_keepalive_interval: self + .broker_keepalive_interval + .ok_or(anyhow!("No broker keepalive interval provided"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, concurrent_tenant_size_logical_size_queries: self .concurrent_tenant_size_logical_size_queries @@ -532,6 +545,7 @@ impl PageServerConf { "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), + "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( LogFormat::from_config(&parse_toml_string(key, item)?)? 
), @@ -659,6 +673,7 @@ impl PageServerConf { profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::dummy_conf(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), } @@ -829,6 +844,9 @@ log_format = 'json' profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL + )?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), }, @@ -872,6 +890,7 @@ log_format = 'json' profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), }, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 74ede7c213..aaf46579a7 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -44,10 +44,13 @@ pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result let broker_endpoint = conf.broker_endpoint.clone(); // Note: we do not attempt connecting here (but validate endpoints sanity). - let broker_client = storage_broker::connect(broker_endpoint.clone()).context(format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ))?; + let broker_client = + storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( + format!( + "Failed to create broker client to {}", + &conf.broker_endpoint + ), + )?; if BROKER_CLIENT.set(broker_client).is_err() { panic!("broker already initialized"); diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index cab5053b5b..275253d1d4 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -82,6 +82,9 @@ struct Args { /// established; plaintext otherwise. #[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)] broker_endpoint: Uri, + /// Broker keepalive interval. + #[arg(long, value_parser= humantime::parse_duration, default_value = storage_broker::DEFAULT_KEEPALIVE_INTERVAL)] + broker_keepalive_interval: Duration, /// Peer safekeeper is considered dead after not receiving heartbeats from /// it during this period passed as a human readable duration. #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)] @@ -142,6 +145,7 @@ fn main() -> anyhow::Result<()> { listen_http_addr: args.listen_http, no_sync: args.no_sync, broker_endpoint: args.broker_endpoint, + broker_keepalive_interval: args.broker_keepalive_interval, heartbeat_timeout: args.heartbeat_timeout, remote_storage: args.remote_storage, max_offloader_lag_bytes: args.max_offloader_lag, diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index df2dc92efe..92f35bf51f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -66,7 +66,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { /// Subscribe and fetch all the interesting data from the broker. 
async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { - let mut client = storage_broker::connect(conf.broker_endpoint)?; + let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; // TODO: subscribe only to local timelines instead of all let request = SubscribeSafekeeperInfoRequest { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 60a1911068..5decfe64de 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -51,6 +51,7 @@ pub struct SafeKeeperConf { pub listen_http_addr: String, pub no_sync: bool, pub broker_endpoint: Uri, + pub broker_keepalive_interval: Duration, pub heartbeat_timeout: Duration, pub remote_storage: Option, pub max_offloader_lag_bytes: u64, @@ -83,6 +84,7 @@ impl SafeKeeperConf { broker_endpoint: storage_broker::DEFAULT_ENDPOINT .parse() .expect("failed to parse default broker endpoint"), + broker_keepalive_interval: Duration::from_secs(5), backup_runtime_threads: None, wal_backup_enabled: true, auth_validation_public_key_path: None, diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 73141318b8..1262bd9333 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -88,7 +88,7 @@ fn tli_from_u64(i: u64) -> Vec { async fn subscribe(client: Option, counter: Arc, i: u64) { let mut client = match client { Some(c) => c, - None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(), + None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { @@ -112,7 +112,7 @@ async fn subscribe(client: Option, counter: Arc, async fn publish(client: Option, n_keys: u64) { let mut client = match client { Some(c) => c, - None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(), + None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; let mut counter: u64 = 0; @@ -152,7 +152,7 @@ async fn main() -> Result<(), Box> { } let h = tokio::spawn(progress_reporter(counters.clone())); - let c = storage_broker::connect(DEFAULT_ENDPOINT).unwrap(); + let c = storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(); for i in 0..args.num_subs { let c = Some(c.clone()); diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index fdf2637b4d..6d80e96bf1 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -39,7 +39,9 @@ use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; -use storage_broker::{parse_proto_ttid, EitherBody, DEFAULT_LISTEN_ADDR}; +use storage_broker::{ + parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, +}; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; @@ -47,8 +49,8 @@ use utils::sentry_init::{init_sentry, release_name}; project_git_version!(GIT_VERSION); -const DEFAULT_CHAN_SIZE: usize = 128; -const DEFAULT_HTTP2_KEEPALIVE_INTERVAL: &str = "5000ms"; +const DEFAULT_CHAN_SIZE: usize = 32; +const DEFAULT_ALL_KEYS_CHAN_SIZE: usize = 16384; #[derive(Parser, Debug)] #[command(version = GIT_VERSION, about = "Broker for neon storage nodes communication", long_about = 
None)] @@ -56,11 +58,14 @@ struct Args { /// Endpoint to listen on. #[arg(short, long, default_value = DEFAULT_LISTEN_ADDR)] listen_addr: SocketAddr, - /// Size of the queue to the subscriber. + /// Size of the queue to the per timeline subscriber. #[arg(long, default_value_t = DEFAULT_CHAN_SIZE)] - chan_size: usize, + timeline_chan_size: usize, + /// Size of the queue to the all keys subscriber. + #[arg(long, default_value_t = DEFAULT_ALL_KEYS_CHAN_SIZE)] + all_keys_chan_size: usize, /// HTTP/2 keepalive interval. - #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HTTP2_KEEPALIVE_INTERVAL)] + #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_KEEPALIVE_INTERVAL)] http2_keepalive_interval: Duration, /// Format for logging, either 'plain' or 'json'. #[arg(long, default_value = "plain")] @@ -108,7 +113,7 @@ struct SharedState { } impl SharedState { - pub fn new(chan_size: usize) -> Self { + pub fn new(all_keys_chan_size: usize) -> Self { SharedState { next_pub_id: 0, num_pubs: 0, @@ -116,7 +121,7 @@ impl SharedState { num_subs_to_timelines: 0, chans_to_timeline_subs: HashMap::new(), num_subs_to_all: 0, - chan_to_all_subs: broadcast::channel(chan_size).0, + chan_to_all_subs: broadcast::channel(all_keys_chan_size).0, } } @@ -139,7 +144,7 @@ impl SharedState { pub fn register_subscriber( &mut self, sub_key: SubscriptionKey, - chan_size: usize, + timeline_chan_size: usize, ) -> (SubId, broadcast::Receiver) { let sub_id = self.next_sub_id; self.next_sub_id += 1; @@ -158,7 +163,7 @@ impl SharedState { self.chans_to_timeline_subs .entry(ttid) .or_insert(ChanToTimelineSub { - chan: broadcast::channel(chan_size).0, + chan: broadcast::channel(timeline_chan_size).0, num_subscribers: 0, }); chan_to_timeline_sub.num_subscribers += 1; @@ -200,7 +205,7 @@ impl SharedState { #[derive(Clone)] struct Registry { shared_state: Arc>, - chan_size: usize, + timeline_chan_size: usize, } impl Registry { @@ -232,7 +237,7 @@ impl Registry { let (sub_id, sub_rx) = self .shared_state .write() - .register_subscriber(sub_key, self.chan_size); + .register_subscriber(sub_key, self.timeline_chan_size); info!( "subscription started id={}, key={:?}, addr={:?}", sub_id, sub_key, remote_addr @@ -369,7 +374,8 @@ impl BrokerService for Broker { Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += skipped_msg; if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { - warn!("dropped {} messages, channel is full", missed_msgs); + warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); missed_msgs = 0; } } @@ -427,8 +433,8 @@ async fn main() -> Result<(), Box> { info!("version: {GIT_VERSION}"); let registry = Registry { - shared_state: Arc::new(RwLock::new(SharedState::new(args.chan_size))), - chan_size: args.chan_size, + shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))), + timeline_chan_size: args.timeline_chan_size, }; let storage_broker_impl = Broker { registry: registry.clone(), @@ -522,7 +528,7 @@ mod tests { async fn test_registry() { let registry = Registry { shared_state: Arc::new(RwLock::new(SharedState::new(16))), - chan_size: 16, + timeline_chan_size: 16, }; // subscribe to timeline 2 diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 0629caa2fb..d12a79a69f 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,6 +1,7 @@ use hyper::body::HttpBody; use std::pin::Pin; use 
std::task::{Context, Poll}; +use std::time::Duration; use tonic::codegen::StdError; use tonic::transport::{ClientTlsConfig, Endpoint}; use tonic::{transport::Channel, Code, Status}; @@ -26,6 +27,8 @@ pub use hyper::Uri; pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); +pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms"; + // BrokerServiceClient charged with tonic provided Channel transport; helps to // avoid depending on tonic directly in user crates. pub type BrokerClientChannel = BrokerServiceClient; @@ -33,7 +36,7 @@ pub type BrokerClientChannel = BrokerServiceClient; // Create connection object configured to run TLS if schema starts with https:// // and plain text otherwise. Connection is lazy, only endpoint sanity is // validated here. -pub fn connect(endpoint: U) -> anyhow::Result +pub fn connect(endpoint: U, keepalive_interval: Duration) -> anyhow::Result where U: std::convert::TryInto, U::Error: std::error::Error + Send + Sync + 'static, @@ -46,6 +49,10 @@ where let tls = ClientTlsConfig::new(); tonic_endpoint = tonic_endpoint.tls_config(tls)?; } + tonic_endpoint = tonic_endpoint + .http2_keep_alive_interval(keepalive_interval) + .keep_alive_while_idle(true); + // keep_alive_timeout is 20s by default on both client and server side let channel = tonic_endpoint.connect_lazy(); Ok(BrokerClientChannel::new(channel)) } From b688a538e3ff843f2acf1b33948aa519b5477ce4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 16 Dec 2022 13:40:01 +0200 Subject: [PATCH 090/167] fix(remote_storage): use cached credentials (#3128) IMDSv2 has limits, and if we query it on every s3 interaction we are going to go over those limits. Changes the s3_bucket client configuration to use: - ChainCredentialsProvider to handle env variables or imds usage - LazyCachingCredentialsProvider to actually cache any credentials Related: https://github.com/awslabs/aws-sdk-rust/issues/629 Possibly related: https://github.com/neondatabase/neon/issues/3118 --- libs/remote_storage/src/s3_bucket.rs | 47 +++++++++++----------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ab1e5da6c5..740f3753d8 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,14 +4,13 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. 
-use std::env::var; use std::sync::Arc; -use std::time::Duration; use anyhow::Context; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, imds, - imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn, + environment::credentials::EnvironmentVariableCredentialsProvider, + imds::credentials::ImdsCredentialsProvider, + meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider}, }; use aws_sdk_s3::{ config::Config, @@ -20,7 +19,6 @@ use aws_sdk_s3::{ Client, Endpoint, Region, }; use aws_smithy_http::body::SdkBody; -use aws_types::credentials::{CredentialsError, ProvideCredentials}; use hyper::Body; use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; @@ -31,8 +29,6 @@ use crate::{ Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, }; -const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10); - pub(super) mod metrics { use metrics::{register_int_counter_vec, IntCounterVec}; use once_cell::sync::Lazy; @@ -122,30 +118,23 @@ impl S3Bucket { "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name ); + + let credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + let env_creds = EnvironmentVariableCredentialsProvider::new(); + // uses imds v2 + let imds = ImdsCredentialsProvider::builder().build(); + + // finally add caching. + // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629 + LazyCachingCredentialsProvider::builder() + .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds)) + .build() + }; + let mut config_builder = Config::builder() .region(Region::new(aws_config.bucket_region.clone())) - .credentials_provider(provide_credentials_fn(|| async { - match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() { - true => { - EnvironmentVariableCredentialsProvider::new() - .provide_credentials() - .await - } - false => { - let imds_client = imds::Client::builder() - .connect_timeout(DEFAULT_IMDS_TIMEOUT) - .read_timeout(DEFAULT_IMDS_TIMEOUT) - .build() - .await - .map_err(CredentialsError::unhandled)?; - ImdsCredentialsProvider::builder() - .imds_client(imds_client) - .build() - .provide_credentials() - .await - } - } - })); + .credentials_provider(credentials_provider); if let Some(custom_endpoint) = aws_config.endpoint.clone() { let endpoint = Endpoint::immutable( From 8d39fcdf728d9929cee5416371bf10b997ca5e2a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 16 Dec 2022 13:23:36 +0000 Subject: [PATCH 091/167] pgbench-compare: don't run neon-captest-new (#3130) Do not run Nightly Benchmarks on `neon-captest-new`. This is a temporary solution to avoid spikes in the storage we consume during the test run. To collect data for the default instance, we could run tests weekly (i.e. not daily). --- .github/workflows/benchmarking.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index e3e0f1e820..07e111b67c 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -18,6 +18,7 @@ on: region_id: description: 'Use a particular region. If not set the default region will be used' required: false + default: 'aws-us-east-2' save_perf_report: type: boolean description: 'Publish perf report or not. 
If not set, the report is published only for the main branch' @@ -115,13 +116,10 @@ jobs: # neon-captest-prefetch: Same, with prefetching enabled (new project) # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ] + platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ] db_size: [ 10gb ] runner: [ us-east-2 ] include: - - platform: neon-captest-new - db_size: 50gb - runner: us-east-2 - platform: neon-captest-prefetch db_size: 50gb runner: us-east-2 From c86c0c08ef769269c4b827967525a35880d14413 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 16 Dec 2022 17:19:47 +0200 Subject: [PATCH 092/167] task_mgr: use CancellationToken instead of shutdown_rx (#3124) this should help us in the future to have more freedom with spawning tasks and cancelling things, most importantly blocking tasks (assuming the CancellationToken::is_cancelled is performant enough). CancellationToken allows creation of hierarchical cancellations, which would also simplify the task_mgr shutdown operation, rendering it unnecessary. --- pageserver/src/task_mgr.rs | 55 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 3325ce01d4..91719fb3af 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -25,7 +25,6 @@ //! the current task has been requested to shut down. You can use that with //! Tokio select!(). //! -//! //! TODO: This would be a good place to also handle panics in a somewhat sane way. //! Depending on what task panics, we might want to kill the whole server, or //! only a single tenant or timeline. @@ -43,9 +42,9 @@ use std::sync::{Arc, Mutex}; use futures::FutureExt; use tokio::runtime::Runtime; -use tokio::sync::watch; use tokio::task::JoinHandle; use tokio::task_local; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; @@ -146,11 +145,10 @@ static TASKS: Lazy>>> = Lazy::new(|| Mutex::new(HashMap::new())); task_local! { - // There is a Tokio watch channel for each task, which can be used to signal the - // task that it needs to shut down. This task local variable holds the receiving - // end of the channel. The sender is kept in the global registry, so that anyone - // can send the signal to request task shutdown. - static SHUTDOWN_RX: watch::Receiver; + // This is a cancellation token which will be cancelled when a task needs to shut down. The + // root token is kept in the global registry, so that anyone can send the signal to request + // task shutdown. + static SHUTDOWN_TOKEN: CancellationToken; // Each task holds reference to its own PageServerTask here. static CURRENT_TASK: Arc; @@ -226,8 +224,8 @@ struct PageServerTask { name: String, - // To request task shutdown, send 'true' to the channel to notify the task. - shutdown_tx: watch::Sender, + // To request task shutdown, just cancel this token. 
+ cancel: CancellationToken, mutable: Mutex, } @@ -247,13 +245,13 @@ pub fn spawn( where F: Future> + Send + 'static, { - let (shutdown_tx, shutdown_rx) = watch::channel(false); + let cancel = CancellationToken::new(); let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed); let task = Arc::new(PageServerTask { task_id: PageserverTaskId(task_id), kind, name: name.to_string(), - shutdown_tx, + cancel: cancel.clone(), mutable: Mutex::new(MutableTaskState { tenant_id, timeline_id, @@ -271,7 +269,7 @@ where task_name, task_id, task_cloned, - shutdown_rx, + cancel, shutdown_process_on_error, future, )); @@ -288,7 +286,7 @@ async fn task_wrapper( task_name: String, task_id: u64, task: Arc, - shutdown_rx: watch::Receiver, + shutdown_token: CancellationToken, shutdown_process_on_error: bool, future: F, ) where @@ -296,9 +294,9 @@ async fn task_wrapper( { debug!("Starting task '{}'", task_name); - let result = SHUTDOWN_RX + let result = SHUTDOWN_TOKEN .scope( - shutdown_rx, + shutdown_token, CURRENT_TASK.scope(task, { // We use AssertUnwindSafe here so that the payload function // doesn't need to be UnwindSafe. We don't do anything after the @@ -408,7 +406,7 @@ pub async fn shutdown_tasks( && (tenant_id.is_none() || task_mut.tenant_id == tenant_id) && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) { - let _ = task.shutdown_tx.send_replace(true); + task.cancel.cancel(); victim_tasks.push(Arc::clone(task)); } } @@ -439,21 +437,28 @@ pub fn current_task_kind() -> Option { /// A Future that can be used to check if the current task has been requested to /// shut down. pub async fn shutdown_watcher() { - let mut shutdown_rx = SHUTDOWN_RX - .try_with(|rx| rx.clone()) + let token = SHUTDOWN_TOKEN + .try_with(|t| t.clone()) .expect("shutdown_requested() called in an unexpected task or thread"); - while !*shutdown_rx.borrow() { - if shutdown_rx.changed().await.is_err() { - break; - } - } + token.cancelled().await; +} + +/// Clone the current task's cancellation token, which can be moved across tasks. +/// +/// When the task which is currently executing is shutdown, the cancellation token will be +/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or +/// `tokio::task::JoinSet::spawn`. +pub fn shutdown_token() -> CancellationToken { + SHUTDOWN_TOKEN + .try_with(|t| t.clone()) + .expect("shutdown_token() called in an unexpected task or thread") } /// Has the current task been requested to shut down? 
pub fn is_shutdown_requested() -> bool { - if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) { - *shutdown_rx.borrow() + if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) { + cancel.is_cancelled() } else { if !cfg!(test) { warn!("is_shutdown_requested() called in an unexpected task or thread"); From 64775a0a756c693b23c84f54fddbdcce5b1d5f3c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 16 Dec 2022 17:45:38 +0000 Subject: [PATCH 093/167] test_runner/performance: fix flush for NeonCompare (#3135) Fix performance tests: ``` AttributeError: 'NeonCompare' object has no attribute 'pageserver_http' ``` --- test_runner/fixtures/compare_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 530e5afaab..fa488c4446 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -115,7 +115,7 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): - self.pageserver_http.timeline_checkpoint(self.env.initial_tenant, self.timeline) + self.pageserver_http_client.timeline_checkpoint(self.env.initial_tenant, self.timeline) self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0) def compact(self): From 83baf49487213c2e1f03ea5d6b3f71c8b3c9f49d Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Thu, 15 Dec 2022 18:10:43 +0300 Subject: [PATCH 094/167] [proxy] Forward compute connection params to client This fixes all kinds of problems related to missing params, like broken timestamps (due to `integer_datetimes`). This solution is not ideal, but it will help. Meanwhile, I'm going to dedicate some time to improving connection machinery. Note that this **does not** fix problems with passing certain parameters in a reverse direction, i.e. **from client to compute**. This is a separate matter and will be dealt with in an upcoming PR. 
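As an aside (illustration only, not part of this patch): each forwarded
parameter ends up on the wire as one ParameterStatus ('S') message, which is
what `BeMessage::ParameterStatus` in `pq_proto` serializes and what the loop
added to `proxy.rs` below emits for every (name, value) pair reported by the
compute node. A minimal self-contained sketch of that encoding follows; the
helper name and the sample parameter values are made up for the example:

```
/// Hypothetical helper, shown only to illustrate the ParameterStatus wire
/// format: tag byte 'S', an i32 length that includes itself, then the
/// NUL-terminated parameter name and value.
fn encode_parameter_status(name: &str, value: &str) -> Vec<u8> {
    let mut buf = Vec::new();
    buf.push(b'S'); // message tag
    // body length = 4-byte length field + name\0 + value\0
    let body_len = 4 + name.len() + 1 + value.len() + 1;
    buf.extend_from_slice(&(body_len as i32).to_be_bytes());
    buf.extend_from_slice(name.as_bytes());
    buf.push(0);
    buf.extend_from_slice(value.as_bytes());
    buf.push(0);
    buf
}

fn main() {
    // Forwarding then amounts to sending one such message to the client for
    // every parameter the compute connection reported, for example:
    for (name, value) in [("integer_datetimes", "on"), ("client_encoding", "UTF8")] {
        let msg = encode_parameter_status(name, value);
        assert_eq!(msg[0], b'S');
        println!("{name} -> {} bytes on the wire", msg.len());
    }
}
```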
--- Cargo.lock | 8 +-- Cargo.toml | 2 +- compute_tools/Cargo.toml | 4 +- control_plane/Cargo.toml | 2 +- libs/postgres_connection/Cargo.toml | 4 +- libs/postgres_ffi/Cargo.toml | 2 +- libs/postgres_ffi/wal_craft/Cargo.toml | 2 +- libs/pq_proto/Cargo.toml | 2 +- libs/pq_proto/src/lib.rs | 64 +++++++++++++----------- libs/utils/src/postgres_backend.rs | 10 ++-- libs/utils/src/postgres_backend_async.rs | 10 ++-- pageserver/Cargo.toml | 8 +-- proxy/Cargo.toml | 2 +- proxy/src/auth/backend/link.rs | 4 +- proxy/src/compute.rs | 52 ++++++++++--------- proxy/src/proxy.rs | 18 ++++--- proxy/src/proxy/tests.rs | 2 +- safekeeper/Cargo.toml | 6 +-- test_runner/regress/test_proxy.py | 30 +++++++++++ 19 files changed, 137 insertions(+), 95 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 913b39da0f..1eb27fb0f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2613,7 +2613,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.2" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea" dependencies = [ "bytes", "fallible-iterator", @@ -2626,7 +2626,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea" dependencies = [ "base64", "byteorder", @@ -2644,7 +2644,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.3" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea" dependencies = [ "bytes", "fallible-iterator", @@ -4010,7 +4010,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 2f73215d3f..0e098d91ee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,4 +86,4 @@ lto = true # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
[patch.crates-io] -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index a35cef197d..6240073cb3 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -12,12 +12,12 @@ futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } notify = "5.0.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 00b34aafb1..9d9d6a5f11 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -10,7 +10,7 @@ comfy-table = "6.1" git-version = "0.3.5" nix = "0.25" once_cell = "1.13.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "69c1ef71cd5418cf063d4ca21eadc3427980caea" } regex = "1" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 314f3c6f1c..25db64337d 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -8,8 +8,8 @@ edition = "2021" [dependencies] anyhow = "1.0" itertools = "0.10.3" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "69c1ef71cd5418cf063d4ca21eadc3427980caea" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 01ff6ab60e..bafc587e80 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } wal_craft = { path = "wal_craft" } [build-dependencies] diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml 
b/libs/postgres_ffi/wal_craft/Cargo.toml index 4c35c5a650..3a22e9d789 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -11,7 +11,7 @@ clap = "4.0" env_logger = "0.9" log = "0.4" once_cell = "1.13.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } postgres_ffi = { path = "../" } tempfile = "3.2" workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 4d48e431b4..dc38abd64b 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" anyhow = "1.0" bytes = "1.0.1" pin-project-lite = "0.2.7" -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } rand = "0.8.3" serde = { version = "1.0", features = ["derive"] } tokio = { version = "1.17", features = ["macros"] } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 2e311dd6e3..0d698127b9 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -463,7 +463,10 @@ pub enum BeMessage<'a> { EncryptionResponse(bool), NoData, ParameterDescription, - ParameterStatus(BeParameterStatusMessage<'a>), + ParameterStatus { + name: &'a [u8], + value: &'a [u8], + }, ParseComplete, ReadyForQuery, RowDescription(&'a [RowDescriptor<'a>]), @@ -472,6 +475,28 @@ pub enum BeMessage<'a> { KeepAlive(WalSndKeepAlive), } +/// Common shorthands. +impl<'a> BeMessage<'a> { + /// A [`BeMessage::ParameterStatus`] holding the client encoding, i.e. UTF-8. + /// This is a sensible default, given that: + /// * rust strings only support this encoding out of the box. + /// * tokio-postgres, postgres-jdbc (and probably more) mandate it. + /// + /// TODO: do we need to report `server_encoding` as well? + pub const CLIENT_ENCODING: Self = Self::ParameterStatus { + name: b"client_encoding", + value: b"UTF8", + }; + + /// Build a [`BeMessage::ParameterStatus`] holding the server version. + pub fn server_version(version: &'a str) -> Self { + Self::ParameterStatus { + name: b"server_version", + value: version.as_bytes(), + } + } +} + #[derive(Debug)] pub enum BeAuthenticationSaslMessage<'a> { Methods(&'a [&'a str]), @@ -485,12 +510,6 @@ pub enum BeParameterStatusMessage<'a> { ServerVersion(&'a str), } -impl BeParameterStatusMessage<'static> { - pub fn encoding() -> BeMessage<'static> { - BeMessage::ParameterStatus(Self::Encoding("UTF8")) - } -} - // One row description in RowDescription packet. #[derive(Debug)] pub struct RowDescriptor<'a> { @@ -587,14 +606,15 @@ fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { } /// Safe write of s into buf as cstring (String in the protocol). 
-fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { - if s.contains(&0) { +fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> { + let bytes = s.as_ref(); + if bytes.contains(&0) { return Err(io::Error::new( io::ErrorKind::InvalidInput, "string contains embedded null", )); } - buf.put_slice(s); + buf.put_slice(bytes); buf.put_u8(0); Ok(()) } @@ -644,7 +664,7 @@ impl<'a> BeMessage<'a> { Methods(methods) => { buf.put_i32(10); // Specifies that SASL auth method is used. for method in methods.iter() { - write_cstr(method.as_bytes(), buf)?; + write_cstr(method, buf)?; } buf.put_u8(0); // zero terminator for the list } @@ -759,7 +779,7 @@ impl<'a> BeMessage<'a> { buf.put_slice(b"CXX000\0"); buf.put_u8(b'M'); // the message - write_cstr(error_msg.as_bytes(), buf)?; + write_cstr(error_msg, buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) @@ -799,24 +819,12 @@ impl<'a> BeMessage<'a> { buf.put_u8(response); } - BeMessage::ParameterStatus(param) => { - use std::io::{IoSlice, Write}; - use BeParameterStatusMessage::*; - - let [name, value] = match param { - Encoding(name) => [b"client_encoding", name.as_bytes()], - ServerVersion(version) => [b"server_version", version.as_bytes()], - }; - - // Parameter names and values are passed as null-terminated strings - let iov = &mut [name, b"\0", value, b"\0"].map(IoSlice::new); - let mut buffer = [0u8; 64]; // this should be enough - let cnt = buffer.as_mut().write_vectored(iov).unwrap(); - + BeMessage::ParameterStatus { name, value } => { buf.put_u8(b'S'); write_body(buf, |buf| { - buf.put_slice(&buffer[..cnt]); - }); + write_cstr(name, buf)?; + write_cstr(value, buf) + })?; } BeMessage::ParameterDescription => { diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 89f7197718..5b34c7adfb 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -6,7 +6,7 @@ use crate::sock_split::{BidiStream, ReadStream, WriteStream}; use anyhow::{bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; use rand::Rng; use serde::{Deserialize, Serialize}; use std::fmt; @@ -361,11 +361,9 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => { self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? // The async python driver requires a valid server_version - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion("14.1"), - ))? + .write_message_noflush(&BeMessage::server_version("14.1"))? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } @@ -413,7 +411,7 @@ impl PostgresBackend { } } self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? 
.write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index 376819027b..a22774c69e 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -6,7 +6,7 @@ use crate::postgres_backend::AuthType; use anyhow::{bail, Context, Result}; use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; use rand::Rng; use std::future::Future; use std::net::SocketAddr; @@ -331,11 +331,9 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => { self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::CLIENT_ENCODING)? // The async python driver requires a valid server_version - .write_message(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion("14.1"), - ))? + .write_message(&BeMessage::server_version("14.1"))? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } @@ -384,7 +382,7 @@ impl PostgresBackend { } } self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::CLIENT_ENCODING)? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 54bbe4714d..9a9bb9bf08 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -36,9 +36,9 @@ nix = "0.25" num-traits = "0.2.15" once_cell = "1.13.0" pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } rand = "0.8.3" regex = "1.4.5" @@ -52,7 +52,7 @@ svg_fmt = "0.4.1" tar = "0.4.33" thiserror = "1.0" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } tokio-util = { version = "0.7.3", features = ["io", "io-util"] } toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.36" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 14a5450d5e..68004e5fe2 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -33,7 +33,7 @@ sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev 
= "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } tokio-rustls = "0.23.0" tracing = "0.1.36" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 440a55f194..641519ac50 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,6 +1,6 @@ use super::{AuthSuccess, NodeInfo}; use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters}; -use pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; @@ -60,7 +60,7 @@ pub async fn handle_user( info!(parent: &span, "sending the auth URL to the user"); client .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&Be::CLIENT_ENCODING)? .write_message(&Be::NoticeResponse(&greeting)) .await?; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4c5edb9673..71421a4a65 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -8,18 +8,17 @@ use tokio::net::TcpStream; use tokio_postgres::NoTls; use tracing::{error, info}; +const COULD_NOT_CONNECT: &str = "Could not connect to compute node"; + #[derive(Debug, Error)] pub enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. - #[error("Failed to connect to the compute node: {0}")] + #[error("{COULD_NOT_CONNECT}: {0}")] Postgres(#[from] tokio_postgres::Error), - #[error("Failed to connect to the compute node")] - FailedToConnectToCompute, - - #[error("Failed to fetch compute node version")] - FailedToFetchPgVersion, + #[error("{COULD_NOT_CONNECT}: {0}")] + CouldNotConnect(#[from] io::Error), } impl UserFacingError for ConnectionError { @@ -29,10 +28,10 @@ impl UserFacingError for ConnectionError { // This helps us drop irrelevant library-specific prefixes. // TODO: propagate severity level and other parameters. Postgres(err) => match err.as_db_error() { - Some(err) => err.message().to_string(), + Some(err) => err.message().to_owned(), None => err.to_string(), }, - other => other.to_string(), + _ => COULD_NOT_CONNECT.to_owned(), } } } @@ -49,7 +48,7 @@ pub struct ConnCfg(pub tokio_postgres::Config); impl ConnCfg { /// Construct a new connection config. pub fn new() -> Self { - Self(tokio_postgres::Config::new()) + Self(Default::default()) } } @@ -95,7 +94,7 @@ impl ConnCfg { io::ErrorKind::Other, format!( "couldn't connect: bad compute config, \ - ports and hosts entries' count does not match: {:?}", + ports and hosts entries' count does not match: {:?}", self.0 ), )); @@ -131,8 +130,8 @@ impl ConnCfg { pub struct PostgresConnection { /// Socket connected to a compute node. pub stream: TcpStream, - /// PostgreSQL version of this instance. - pub version: String, + /// PostgreSQL connection parameters. + pub params: std::collections::HashMap, } impl ConnCfg { @@ -156,6 +155,7 @@ impl ConnCfg { self.0.application_name(app_name); } + // TODO: This is especially ugly... if let Some(replication) = params.get("replication") { use tokio_postgres::config::ReplicationMode; match replication { @@ -172,22 +172,24 @@ impl ConnCfg { // TODO: extend the list of the forwarded startup parameters. 
// Currently, tokio-postgres doesn't allow us to pass // arbitrary parameters, but the ones above are a good start. + // + // This and the reverse params problem can be better addressed + // in a bespoke connection machinery (a new library for that sake). - let (socket_addr, mut stream) = self - .connect_raw() - .await - .map_err(|_| ConnectionError::FailedToConnectToCompute)?; - - // TODO: establish a secure connection to the DB - let (client, conn) = self.0.connect_raw(&mut stream, NoTls).await?; - let version = conn - .parameter("server_version") - .ok_or(ConnectionError::FailedToFetchPgVersion)? - .into(); - + // TODO: establish a secure connection to the DB. + let (socket_addr, mut stream) = self.connect_raw().await?; + let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?; info!("connected to user's compute node at {socket_addr}"); + + // This is very ugly but as of now there's no better way to + // extract the connection parameters from tokio-postgres' connection. + // TODO: solve this problem in a more elegant manner (e.g. the new library). + let params = connection.parameters; + + // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. + // Yet another reason to rework the connection establishing code. let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); - let db = PostgresConnection { stream, version }; + let db = PostgresConnection { stream, params }; Ok((db, cancel_closure)) } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index da3cb144e3..713388c625 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -255,15 +255,21 @@ impl Client<'_, S> { // Note that we do this only (for the most part) after we've connected // to a compute (see above) which performs its own authentication. if !auth_result.reported_auth_ok { - stream - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; + stream.write_message_noflush(&Be::AuthenticationOk)?; + } + + // Forward all postgres connection params to the client. + // Right now the implementation is very hacky and inefficent (ideally, + // we don't need an intermediate hashmap), but at least it should be correct. + for (name, value) in &db.params { + // TODO: Theoretically, this could result in a big pile of params... + stream.write_message_noflush(&Be::ParameterStatus { + name: name.as_bytes(), + value: value.as_bytes(), + })?; } stream - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&db.version), - ))? .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? .write_message(&BeMessage::ReadyForQuery) .await?; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 24fbc57b99..2f023844d0 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -139,7 +139,7 @@ async fn dummy_proxy( stream .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&Be::CLIENT_ENCODING)? 
.write_message(&BeMessage::ReadyForQuery) .await?; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index d11ef1711a..72a51ec443 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -20,8 +20,8 @@ hyper = "0.14" nix = "0.25" once_cell = "1.13.0" parking_lot = "0.12.1" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } regex = "1.4.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1" @@ -29,7 +29,7 @@ serde_with = "2.0" signal-hook = "0.3.10" thiserror = "1" tokio = { version = "1.17", features = ["macros", "fs"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.27" url = "2.2.2" diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index eab9505fbb..4d2b63d360 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -122,3 +122,33 @@ def test_auth_errors(static_proxy: NeonProxy): # Finally, check that the user can connect with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"): pass + + +def test_forward_params_to_client(static_proxy: NeonProxy): + # A subset of parameters (GUCs) which postgres + # sends to the client during connection setup. + # Unfortunately, `GUC_REPORT` can't be queried. + # Proxy *should* forward them, otherwise client library + # might misbehave (e.g. parse timestamps incorrectly). + reported_params_subset = [ + "client_encoding", + "integer_datetimes", + "is_superuser", + "server_encoding", + "server_version", + "session_authorization", + "standard_conforming_strings", + ] + + query = """ + select name, setting + from pg_catalog.pg_settings + where name = any(%s) + """ + + with static_proxy.connect(options="project=irrelevant") as conn: + with conn.cursor() as cur: + cur.execute(query, (reported_params_subset,)) + for name, value in cur.fetchall(): + # Check that proxy has forwarded this parameter. + assert conn.get_parameter_status(name) == value From 3514e6e89a2ddd100057f5820a5cfd0f203cd3f3 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 16 Dec 2022 21:14:57 +0100 Subject: [PATCH 095/167] Use neon_nblocks instead of get_cached_relsize (#3132) This prevents us from overwriting all blocks of a relation when we extend the relation without first caching the size - get_cached_relsize does not guarantee a correct result when it returns `false`. --- pgxn/neon/pagestore_smgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 73bf330baf..900f44ca10 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1669,7 +1669,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, * (leaving holes). 
But this rule is violated in PG-15 where CreateAndCopyRelationData * call smgrextend for destination relation n using size of source relation */ - get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks); + n_blocks = neon_nblocks(reln, forkNum); while (n_blocks < blkno) neon_wallog_page(reln, forkNum, n_blocks++, buffer, true); From 61194ab2f430df3eb5969477bb1a1861456ec136 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 16 Dec 2022 21:58:41 +0300 Subject: [PATCH 096/167] Update rust-postgres everywhere I've rebased[1] Neon's fork of rust-postgres to incorporate latest upstream changes (including dependabot's fixes), so we need to advance revs here as well. [1] https://github.com/neondatabase/rust-postgres/commits/neon --- Cargo.lock | 53 +++++++++++++++----------- Cargo.toml | 2 +- compute_tools/Cargo.toml | 4 +- control_plane/Cargo.toml | 2 +- libs/postgres_connection/Cargo.toml | 4 +- libs/postgres_ffi/Cargo.toml | 2 +- libs/postgres_ffi/wal_craft/Cargo.toml | 2 +- libs/pq_proto/Cargo.toml | 2 +- pageserver/Cargo.toml | 8 ++-- proxy/Cargo.toml | 2 +- safekeeper/Cargo.toml | 6 +-- 11 files changed, 47 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1eb27fb0f9..665000746d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -563,6 +563,12 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +[[package]] +name = "base64" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" + [[package]] name = "bincode" version = "1.3.3" @@ -1920,7 +1926,7 @@ version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" dependencies = [ - "base64", + "base64 0.13.1", "pem", "ring", "serde", @@ -2507,7 +2513,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4" dependencies = [ - "base64", + "base64 0.13.1", ] [[package]] @@ -2528,18 +2534,18 @@ dependencies = [ [[package]] name = "phf" -version = "0.10.1" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" dependencies = [ "phf_shared", ] [[package]] name = "phf_shared" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676" dependencies = [ "siphasher", ] @@ -2612,12 +2618,12 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.2" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea" +version = "0.19.4" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "bytes", "fallible-iterator", - "futures", + "futures-util", "log", "tokio", "tokio-postgres", @@ -2626,9 +2632,9 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = 
"git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ - "base64", + "base64 0.20.0", "byteorder", "bytes", "fallible-iterator", @@ -2643,8 +2649,8 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.3" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea" +version = "0.2.4" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "bytes", "fallible-iterator", @@ -2868,7 +2874,7 @@ dependencies = [ "anyhow", "async-trait", "atty", - "base64", + "base64 0.13.1", "bstr", "bytes", "clap 4.0.29", @@ -3078,7 +3084,7 @@ version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c" dependencies = [ - "base64", + "base64 0.13.1", "bytes", "encoding_rs", "futures-core", @@ -3261,7 +3267,7 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55" dependencies = [ - "base64", + "base64 0.13.1", ] [[package]] @@ -3542,7 +3548,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" dependencies = [ - "base64", + "base64 0.13.1", "chrono", "hex", "indexmap", @@ -4009,14 +4015,15 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea" +version = "0.7.7" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "async-trait", "byteorder", "bytes", "fallible-iterator", - "futures", + "futures-channel", + "futures-util", "log", "parking_lot 0.12.1", "percent-encoding", @@ -4109,7 +4116,7 @@ dependencies = [ "async-stream", "async-trait", "axum", - "base64", + "base64 0.13.1", "bytes", "futures-core", "futures-util", @@ -4351,7 +4358,7 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97acb4c28a254fd7a4aeec976c46a7fa404eac4d7c134b30c75144846d7cb8f" dependencies = [ - "base64", + "base64 0.13.1", "chunked_transfer", "log", "native-tls", @@ -4787,7 +4794,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" dependencies = [ "asn1-rs", - "base64", + "base64 0.13.1", "data-encoding", "der-parser", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 0e098d91ee..927900d5c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,4 +86,4 @@ lto = true # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
[patch.crates-io] -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 6240073cb3..c40d870649 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -12,12 +12,12 @@ futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } notify = "5.0.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 9d9d6a5f11..180508a01a 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -10,7 +10,7 @@ comfy-table = "6.1" git-version = "0.3.5" nix = "0.25" once_cell = "1.13.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 25db64337d..1924b260fa 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -8,8 +8,8 @@ edition = "2021" [dependencies] anyhow = "1.0" itertools = "0.10.3" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "69c1ef71cd5418cf063d4ca21eadc3427980caea" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index bafc587e80..59eec3de32 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } wal_craft = { path = "wal_craft" } [build-dependencies] diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml 
b/libs/postgres_ffi/wal_craft/Cargo.toml index 3a22e9d789..dd9f82a87a 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -11,7 +11,7 @@ clap = "4.0" env_logger = "0.9" log = "0.4" once_cell = "1.13.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres_ffi = { path = "../" } tempfile = "3.2" workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index dc38abd64b..76d8fbf28d 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" anyhow = "1.0" bytes = "1.0.1" pin-project-lite = "0.2.7" -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } rand = "0.8.3" serde = { version = "1.0", features = ["derive"] } tokio = { version = "1.17", features = ["macros"] } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9a9bb9bf08..24642ca2f7 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -36,9 +36,9 @@ nix = "0.25" num-traits = "0.2.15" once_cell = "1.13.0" pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } rand = "0.8.3" regex = "1.4.5" @@ -52,7 +52,7 @@ svg_fmt = "0.4.1" tar = "0.4.33" thiserror = "1.0" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } tokio-util = { version = "0.7.3", features = ["io", "io-util"] } toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.36" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 68004e5fe2..e630b2758d 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -33,7 +33,7 @@ sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } tokio-rustls = "0.23.0" tracing = "0.1.36" 
tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 72a51ec443..fbcb3f34f7 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -20,8 +20,8 @@ hyper = "0.14" nix = "0.25" once_cell = "1.13.0" parking_lot = "0.12.1" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1.4.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1" @@ -29,7 +29,7 @@ serde_with = "2.0" signal-hook = "0.3.10" thiserror = "1" tokio = { version = "1.17", features = ["macros", "fs"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.27" url = "2.2.2" From 12e6f443dae80f316832cf81d83b4f71eb17bbc9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sun, 18 Dec 2022 00:02:04 +0000 Subject: [PATCH 097/167] test_perf_pgbench: switch to server-side data generation (#3058) To offload the network and reduce its impact, I suggest switching to server-side data generation for the pgbench initialize workflow. --- test_runner/fixtures/benchmark_fixture.py | 63 ++++++++++++-------- test_runner/performance/test_perf_pgbench.py | 6 +- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 27fb0a60b2..b1489b7ab1 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -11,7 +11,7 @@ from datetime import datetime from pathlib import Path # Type-related stuff -from typing import Callable, ClassVar, Iterator, Optional +from typing import Callable, ClassVar, Dict, Iterator, Optional import pytest from _pytest.config import Config @@ -135,23 +135,26 @@ class PgBenchRunResult: @dataclasses.dataclass class PgBenchInitResult: - REGEX: ClassVar[re.Pattern] = re.compile( # type: ignore[type-arg] - r"done in (\d+\.\d+) s " - r"\(" - r"(?:drop tables (\d+\.\d+) s)?(?:, )?" - r"(?:create tables (\d+\.\d+) s)?(?:, )?" - r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" - r"(?:vacuum (\d+\.\d+) s)?(?:, )?" - r"(?:primary keys (\d+\.\d+) s)?(?:, )?" - r"\)\." 
- ) + # Taken from https://github.com/postgres/postgres/blob/REL_15_1/src/bin/pgbench/pgbench.c#L5144-L5171 + EXTRACTORS: ClassVar[Dict[str, re.Pattern]] = { # type: ignore[type-arg] + "drop_tables": re.compile(r"drop tables (\d+\.\d+) s"), + "create_tables": re.compile(r"create tables (\d+\.\d+) s"), + "client_side_generate": re.compile(r"client-side generate (\d+\.\d+) s"), + "server_side_generate": re.compile(r"server-side generate (\d+\.\d+) s"), + "vacuum": re.compile(r"vacuum (\d+\.\d+) s"), + "primary_keys": re.compile(r"primary keys (\d+\.\d+) s"), + "foreign_keys": re.compile(r"foreign keys (\d+\.\d+) s"), + "total": re.compile(r"done in (\d+\.\d+) s"), # Total time printed by pgbench + } - total: float + total: Optional[float] drop_tables: Optional[float] create_tables: Optional[float] client_side_generate: Optional[float] + server_side_generate: Optional[float] vacuum: Optional[float] primary_keys: Optional[float] + foreign_keys: Optional[float] duration: float start_timestamp: int end_timestamp: int @@ -164,25 +167,35 @@ class PgBenchInitResult: start_timestamp: int, end_timestamp: int, ): - # Parses pgbench initialize output for default initialization steps (dtgvp) + # Parses pgbench initialize output # Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s). last_line = stderr.splitlines()[-1] - if (m := cls.REGEX.match(last_line)) is not None: - total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [ - float(v) for v in m.groups() if v is not None - ] - else: + timings: Dict[str, Optional[float]] = {} + last_line_items = re.split(r"\(|\)|,", last_line) + for item in last_line_items: + for key, regex in cls.EXTRACTORS.items(): + if (m := regex.match(item.strip())) is not None: + if key in timings: + raise RuntimeError( + f"can't store pgbench results for repeated action `{key}`" + ) + + timings[key] = float(m.group(1)) + + if not timings or "total" not in timings: raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`") return cls( - total=total, - drop_tables=drop_tables, - create_tables=create_tables, - client_side_generate=client_side_generate, - vacuum=vacuum, - primary_keys=primary_keys, + total=timings["total"], + drop_tables=timings.get("drop_tables", 0.0), + create_tables=timings.get("create_tables", 0.0), + client_side_generate=timings.get("client_side_generate", 0.0), + server_side_generate=timings.get("server_side_generate", 0.0), + vacuum=timings.get("vacuum", 0.0), + primary_keys=timings.get("primary_keys", 0.0), + foreign_keys=timings.get("foreign_keys", 0.0), duration=duration, start_timestamp=start_timestamp, end_timestamp=end_timestamp, @@ -326,8 +339,10 @@ class NeonBenchmarker: "drop_tables", "create_tables", "client_side_generate", + "server_side_generate", "vacuum", "primary_keys", + "foreign_keys", ] for metric in metrics: if (value := getattr(result, metric)) is not None: diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 015cc40a72..50e5366c1e 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -15,7 +15,7 @@ from fixtures.utils import get_scale_for_db @enum.unique class PgBenchLoadType(enum.Enum): INIT = "init" - SIMPLE_UPDATE = "simple_update" + SIMPLE_UPDATE = "simple-update" SELECT_ONLY = "select-only" @@ -94,7 +94,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if 
workload_type == PgBenchLoadType.INIT: # Run initialize - init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password) + init_pgbench( + env, ["pgbench", f"-s{scale}", "-i", "-I", "dtGvp", connstr], password=password + ) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload From e23d5da51cf69935fad9ae1db83fb07dc6996181 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Dec 2022 10:52:16 +0200 Subject: [PATCH 098/167] Tidy up and add comments to the pageserver startup code. To make it more readable. --- pageserver/src/bin/pageserver.rs | 70 ++++++++++++++------------------ 1 file changed, 30 insertions(+), 40 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 345f391e61..47e9382e6d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -201,8 +201,12 @@ fn initialize_config( } fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { + // Initialize logging logging::init(conf.log_format)?; + + // Print version to the log, and expose it as a prometheus metric too. info!("version: {}", version()); + set_build_info_metric(GIT_VERSION); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -218,38 +222,37 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { ) } + // Create and lock PID file. This ensures that there cannot be more than one + // pageserver process running at the same time. let lock_file_path = conf.workdir.join(PID_FILE_NAME); let lock_file = utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; info!("Claimed pid file at {lock_file_path:?}"); - // ensure that the lock file is held even if the main thread of the process is panics - // we need to release the lock file only when the current process is gone + // Ensure that the lock file is held even if the main thread of the process panics. + // We need to release the lock file only when the process exits. std::mem::forget(lock_file); - // TODO: Check that it looks like a valid repository before going further + // Bind the HTTP and libpq ports early, so that if they are in use by some other + // process, we error out early. 
+ let http_addr = &conf.listen_http_addr; + info!("Starting pageserver http handler on {http_addr}"); + let http_listener = tcp_listener::bind(http_addr)?; - // bind sockets before daemonizing so we report errors early and do not return until we are listening - info!( - "Starting pageserver http handler on {}", - conf.listen_http_addr - ); - let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?; - - info!( - "Starting pageserver pg protocol handler on {}", - conf.listen_pg_addr - ); - let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?; + let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); + let pageserver_listener = tcp_listener::bind(pg_addr)?; + // Install signal handlers let signals = signals::install_shutdown_handlers()?; - // start profiler (if enabled) + // Start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); + // Launch broker client WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; - // initialize authentication for incoming connections + // Initialize authentication for incoming connections let auth = match &conf.auth_type { AuthType::Trust | AuthType::MD5 => None, AuthType::NeonJWT => { @@ -277,6 +280,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { } }; + // Set up remote storage client let remote_storage = conf .remote_storage_config .as_ref() @@ -284,30 +288,18 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { .transpose() .context("Failed to init generic remote storage")?; - let (init_result_sender, init_result_receiver) = - std::sync::mpsc::channel::>(); - let storage_for_spawn = remote_storage.clone(); - let _handler = BACKGROUND_RUNTIME.spawn(async move { - let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await; - init_result_sender.send(result) - }); - match init_result_receiver.recv() { - Ok(init_result) => init_result.context("Failed to init tenant_mgr")?, - Err(_sender_dropped_err) => { - anyhow::bail!("Failed to init tenant_mgr: no init status was returned"); - } - } + // Scan the local 'tenants/' directory and start loading the tenants + BACKGROUND_RUNTIME.block_on(tenant_mgr::init_tenant_mgr(conf, remote_storage.clone()))?; - // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. - // bind before launching separate thread so the error reported before startup exits - - // Create a Service from the router above to handle incoming requests. + // Start up the service to handle HTTP mgmt API request. We created the + // listener earlier already. { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, auth.clone(), remote_storage)?; - let service = - utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap(); + let router = http::make_router(conf, auth.clone(), remote_storage)? + .build() + .map_err(|err| anyhow!(err))?; + let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) .with_graceful_shutdown(task_mgr::shutdown_watcher()); @@ -327,7 +319,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { } // Spawn a task to listen for libpq connections. It will spawn further tasks - // for each connection. + // for each connection. We created the listener earlier already. 
task_mgr::spawn( COMPUTE_REQUEST_RUNTIME.handle(), TaskKind::LibpqEndpointListener, @@ -340,8 +332,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }, ); - set_build_info_metric(GIT_VERSION); - // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { Signal::Quit => { From c785a516aa6230a36fe48c9a96eb2a802125d5ad Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 19 Dec 2022 13:02:42 +0100 Subject: [PATCH 099/167] remove TimelineInfo.{Remote,Local} along with their types follow-up of https://github.com/neondatabase/neon/pull/2615 which is neon.git: 538876650a0c303aeae4fac71336a3d62aa6da28 must be deployed after cloud.git change https://github.com/neondatabase/cloud/issues/3232 fixes https://github.com/neondatabase/neon/issues/3041 --- libs/pageserver_api/src/models.rs | 23 --------------------- pageserver/src/http/openapi_spec.yml | 31 ---------------------------- pageserver/src/http/routes.rs | 16 ++------------ 3 files changed, 2 insertions(+), 68 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e49b7051d2..586ce2a73a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -203,29 +203,6 @@ pub struct TimelineInfo { pub pg_version: u32, pub state: TimelineState, - - // Some of the above fields are duplicated in 'local' and 'remote', for backwards- - // compatility with older clients. - pub local: LocalTimelineInfo, - pub remote: RemoteTimelineInfo, -} - -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct LocalTimelineInfo { - #[serde_as(as = "Option")] - pub ancestor_timeline_id: Option, - #[serde_as(as = "Option")] - pub ancestor_lsn: Option, - pub current_logical_size: Option, // is None when timeline is Unloaded - pub current_physical_size: Option, // is None when timeline is Unloaded -} - -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct RemoteTimelineInfo { - #[serde_as(as = "Option")] - pub remote_consistent_lsn: Option, } pub type ConfigureFailpointsRequest = Vec; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b372410c0d..67cf4ea326 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -795,37 +795,6 @@ components: latest_gc_cutoff_lsn: type: string format: hex - - # These 'local' and 'remote' fields just duplicate some of the fields - # above. They are kept for backwards-compatibility. They can be removed, - # when the control plane has been updated to look at the above fields - # directly. 
- local: - $ref: "#/components/schemas/LocalTimelineInfo" - remote: - $ref: "#/components/schemas/RemoteTimelineInfo" - - LocalTimelineInfo: - type: object - properties: - ancestor_timeline_id: - type: string - format: hex - ancestor_lsn: - type: string - format: hex - current_logical_size: - type: integer - current_physical_size: - type: integer - RemoteTimelineInfo: - type: object - required: - - remote_consistent_lsn - properties: - remote_consistent_lsn: - type: string - format: hex Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0ef555c4aa..40d2a0e0ef 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -7,8 +7,8 @@ use remote_storage::GenericRemoteStorage; use tracing::*; use super::models::{ - LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest, - TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, + TimelineCreateRequest, TimelineInfo, }; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::tenant::Timeline; @@ -147,18 +147,6 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result Date: Thu, 15 Dec 2022 12:15:23 +0100 Subject: [PATCH 100/167] [1/4] initial logical size calculation: if it fails, retry on next call Before this patch, if the task fails, we would not reset self.initial_size_computation_started. So, if it fails, we will return the approximate value forever. In practice, it probably never failed because the local filesystem is quite reliable. But with on-demand download, the logical size calculation may need to download layers, which is more likely to fail at times. There will be internal retires with a timeout, but eventually, the downloads will give up. We want to retry in those cases. While we're at it, also change the handling of the timeline state watch so that we treat it as an error. Most likely, we'll not be called again, but if we are, retrying is the right thing. --- pageserver/src/tenant/timeline.rs | 146 +++++++++++++++++++----------- 1 file changed, 91 insertions(+), 55 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cc6583dcf6..b61ef09c46 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -15,7 +15,7 @@ use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; +use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::{Duration, Instant, SystemTime}; @@ -176,7 +176,7 @@ pub struct Timeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, - initial_size_computation_started: AtomicBool, + initial_size_computation_state: Mutex, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -189,6 +189,14 @@ pub struct Timeline { state: watch::Sender, } +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +enum InitialLogicalSizeComputationState { + NotStarted, + Running, + FailedWillRetryNextTime, + Success, +} + /// Internal structure to hold all data needed for logical size calculation. /// Calculation consists of two parts: /// 1. Initial size calculation. 
That might take a long time, because it requires @@ -804,7 +812,9 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - initial_size_computation_started: AtomicBool::new(false), + initial_size_computation_state: Mutex::new( + InitialLogicalSizeComputationState::NotStarted, + ), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -1221,59 +1231,85 @@ impl Timeline { } fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { - // Atomically check if the timeline size calculation had already started. - // If the flag was not already set, this sets it. - if !self - .initial_size_computation_started - .swap(true, AtomicOrdering::SeqCst) - { - // We need to start the computation task. - let self_clone = Arc::clone(self); - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_id), - Some(self.timeline_id), - "initial size calculation", - false, - async move { - let mut timeline_state_updates = self_clone.subscribe_for_state_updates(); - let self_calculation = Arc::clone(&self_clone); - tokio::select! { - calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => { - let calculated_size = calculation_result - .context("Failed to spawn calculation result task")? - .context("Failed to calculate logical size")?; - match self_clone.current_logical_size.initial_logical_size.set(calculated_size) { - Ok(()) => info!("Successfully calculated initial logical size"), - Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), - } - Ok(()) - }, - new_event = async { - loop { - match timeline_state_updates.changed().await { - Ok(()) => { - let new_state = *timeline_state_updates.borrow(); - match new_state { - // we're running this job for active timelines only - TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state), - } - } - Err(_sender_dropped_error) => return None, - } - } - } => { - match new_event { - Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"), - None => info!("Timeline dropped state updates sender, stopping init size calculation"), - } - Ok(()) - }, + use InitialLogicalSizeComputationState::*; + let mut guard = self.initial_size_computation_state.lock().unwrap(); + match *guard { + Running | Success => return, + NotStarted | FailedWillRetryNextTime => *guard = Running, + } + drop(guard); + // We need to start the computation task. + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "initial size calculation", + false, + async move { + let res = self_clone + .initial_logical_size_calculation_task(init_lsn) + .await; + // task_mgr will log the result + let new_state = match res { + Ok(_) => Success, + Err(_) => FailedWillRetryNextTime, + }; + let mut state = self_clone.initial_size_computation_state.lock().unwrap(); + if *state != Running { + // Should be unreachable, but no reason to crash the pageserver. Don't touch anything. 
+ error!("expecting initial size computation task to be in state {Running:?}, got {state:?}") + } else { + *state = new_state; + } + res + }, + ); + } + + #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] + async fn initial_logical_size_calculation_task( + self: &Arc, + init_lsn: Lsn, + ) -> anyhow::Result<()> { + let mut timeline_state_updates = self.subscribe_for_state_updates(); + let self_calculation = Arc::clone(self); + tokio::select! { + calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => { + let calculated_size = calculation_result + .context("Failed to spawn calculation result task")? + .context("Failed to calculate logical size")?; + match self.current_logical_size.initial_logical_size.set(calculated_size) { + Ok(()) => (), + Err(existing_size) => { + // This shouldn't happen because we use self.initial_size_computation_running to ensure exlusivity here. + // But if it happens, just complain & report success so there are no further retries. + error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing") } - }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)), - ); + } + Ok(()) + }, + new_event = async { + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = *timeline_state_updates.borrow(); + match new_state { + // we're running this job for active timelines only + TimelineState::Active => continue, + TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state), + } + } + Err(_sender_dropped_error) => return None, + } + } + } => { + match new_event { + Some(new_state) => anyhow::bail!("aborted because timeline became inactive (new state: {new_state:?})"), + None => anyhow::bail!("aborted because state watch was dropped"), // can't happen, the sender is not dropped as long as the Timeline exists + } + }, } } From 40a3d508834e60860f8888ab68a357eba178c138 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 15 Dec 2022 15:17:13 +0100 Subject: [PATCH 101/167] [2/4] add test to show that tenant detach makes us leak running size calculation task --- pageserver/src/tenant/timeline.rs | 21 ++++++ test_runner/regress/test_timeline_size.py | 84 ++++++++++++++++++++++- 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b61ef09c46..e957878472 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1321,6 +1321,27 @@ impl Timeline { "Calculating logical size for timeline {} at {}", self.timeline_id, up_to_lsn ); + // These failpoints are used by python tests to ensure that we don't delete + // the timeline while the logical size computation is ongoing. + // The first failpoint is used to make this function pause. + // Then the python test initiates timeline delete operation in a thread. + // It waits for a few seconds, then arms the second failpoint and disables + // the first failpoint. The second failpoint prints an error if the timeline + // delete code has deleted the on-disk state while we're still running here. + // It shouldn't do that. If it does it anyway, the error will be caught + // by the test suite, highlighting the problem. 
+ fail::fail_point!("timeline-calculate-logical-size-pause"); + fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { + if !self + .conf + .metadata_path(self.timeline_id, self.tenant_id) + .exists() + { + error!("timeline-calculate-logical-size-pre metadata file does not exist") + } + // need to return something + Ok(0) + }); let timer = if up_to_lsn == self.initdb_lsn { if let Some(size) = self.current_logical_size.initialized_size() { if size != 0 { diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 4b70c2ea18..e881608a44 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,6 +1,8 @@ import math +import queue import random import re +import threading import time from contextlib import closing from pathlib import Path @@ -11,6 +13,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverApiException, PageserverHttpClient, PgBin, PortDistributor, @@ -19,7 +22,7 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.types import TenantId, TimelineId -from fixtures.utils import get_timeline_dir_size +from fixtures.utils import get_timeline_dir_size, wait_until def test_timeline_size(neon_simple_env: NeonEnv): @@ -213,6 +216,85 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" +def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + # load in some data + pg = env.postgres.create_start("main", tenant_id=tenant_id) + pg.safe_psql_many( + [ + "CREATE TABLE foo (x INTEGER)", + "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g", + ] + ) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pg.stop() + + # restart with failpoint inside initial size calculation task + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + def tenant_active(): + all_states = client.tenant_list() + [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] + assert tenant["state"] == "Active" + + wait_until(30, 1, tenant_active) + + # kick off initial size calculation task (the response we get here is the estimated size) + def assert_size_calculation_not_done(): + details = client.timeline_detail( + tenant_id, timeline_id, include_non_incremental_logical_size=True + ) + assert details["current_logical_size"] != details["current_logical_size_non_incremental"] + + assert_size_calculation_not_done() + # ensure we're really stuck + time.sleep(5) + assert_size_calculation_not_done() + + log.info( + "try to delete the timeline, this should cancel size computation tasks and wait for them to finish" + ) + env.pageserver.allowed_errors.append( + f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested" + ) + delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) + + def delete_timeline_thread_fn(): + try: + client.tenant_detach(tenant_id) + delete_timeline_success.put(True) + except PageserverApiException: + delete_timeline_success.put(False) + raise + + delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn) + 
delete_timeline_thread.start() + # give it some time to settle in the state where it waits for size computation task + time.sleep(5) + assert ( + not delete_timeline_success.empty() + ), "delete timeline should be stuck waiting for size computation task" + + log.info( + "resume the size calculation. The failpoint checks that the timeline directory still exists." + ) + client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return")) + client.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + log.info("wait for delete timeline thread to finish and assert that it succeeded") + assert delete_timeline_success.get() + + # if the implementation is incorrect, the teardown would complain about an error log + # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists" + + def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env = neon_simple_env new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") From 38ebd6e7a00bb37b221095158e3b79c6a33ba5b5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 15 Dec 2022 15:16:25 +0100 Subject: [PATCH 102/167] [3/4] make initial size estimation task sensitive to task_mgr shutdown requests This exacerbates the problem pointed out in the previous commit. Why? Because with this patch, deleting a timeline also exposes the issue. Extend the test to expose the problem. --- pageserver/src/tenant/timeline.rs | 3 +++ test_runner/regress/test_timeline_size.py | 27 +++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e957878472..b7f12609e6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1290,6 +1290,9 @@ impl Timeline { } Ok(()) }, + _ = task_mgr::shutdown_watcher() => { + anyhow::bail!("aborted because task_mgr shutdown requested"); + } new_event = async { loop { match timeline_state_updates.changed().await { diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index e881608a44..38660cefac 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -9,6 +9,7 @@ from pathlib import Path import psycopg2.errors import psycopg2.extras +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -216,7 +217,10 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" -def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("deletion_method", ["tenant_detach", "timeline_delete"]) +def test_timeline_initial_logical_size_calculation_cancellation( + neon_env_builder: NeonEnvBuilder, deletion_method: str +): env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -259,16 +263,20 @@ def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder assert_size_calculation_not_done() log.info( - "try to delete the timeline, this should cancel size computation tasks and wait for them to finish" - ) - env.pageserver.allowed_errors.append( - f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested" + f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and 
wait for them to finish" ) + if deletion_method == "timeline_delete": + env.pageserver.allowed_errors.append( + f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested" + ) delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) def delete_timeline_thread_fn(): try: - client.tenant_detach(tenant_id) + if deletion_method == "tenant_detach": + client.tenant_detach(tenant_id) + elif deletion_method == "timeline_delete": + client.timeline_delete(tenant_id, timeline_id) delete_timeline_success.put(True) except PageserverApiException: delete_timeline_success.put(False) @@ -278,9 +286,10 @@ def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder delete_timeline_thread.start() # give it some time to settle in the state where it waits for size computation task time.sleep(5) - assert ( - not delete_timeline_success.empty() - ), "delete timeline should be stuck waiting for size computation task" + if not delete_timeline_success.empty(): + assert ( + False + ), f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" log.info( "resume the size calculation. The failpoint checks that the timeline directory still exists." From 7db018e1477e721802d603496f4722012423aa7a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 15 Dec 2022 17:20:38 +0100 Subject: [PATCH 103/167] [4/4] the fix: do not leak spawn_blocking() tasks from logical size calculation code - Refactor logical_size_calculation_task, moving the pieces that are specific to try_spawn_size_init_task into that function. This allows us to spawn additional size calculation tasks that are not init size calculation tasks. - As part of this refactoring, stop logging cancellations as errors. They are part of regular operations. Logging them as errors was inadvertently introduced in earlier commit 427c1b2e9661161439e65aabc173d695cfc03ab4 initial logical size calculation: if it fails, retry on next call - Change tenant size model request code to spawn task_mgr tasks using the refactored logical_size_calculation_task function. Using a task_mgr task ensures that the calculation cannot outlive the timeline. - There are presumably still some subtle race conditions if a size requests comes in at exactly the same time as a detach / delete request. - But that's the concern of diferent area of the code (e.g., tenant_mgr) and requires holistic solutions, such as the proposed TenantGuard. - Make size calculation cancellable using CancellationToken. This is more of a cherry on top. NB: the test code doesn't use this because we _must_ return from the failpoint, because the failpoint lib doesn't allow to just continue execution in combination with executing the closure. This commit fixes the tests introduced earlier in this patch series. 
--- pageserver/src/http/routes.rs | 9 +- pageserver/src/pgdatadir_mapping.rs | 22 ++- pageserver/src/tenant/size.rs | 59 ++++--- pageserver/src/tenant/timeline.rs | 202 ++++++++++++++-------- test_runner/regress/test_timeline_size.py | 4 - 5 files changed, 187 insertions(+), 109 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 40d2a0e0ef..68a26b8098 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -4,6 +4,7 @@ use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use remote_storage::GenericRemoteStorage; +use tokio_util::sync::CancellationToken; use tracing::*; use super::models::{ @@ -86,8 +87,14 @@ fn build_timeline_info( ) -> anyhow::Result { let mut info = build_timeline_info_common(timeline)?; if include_non_incremental_logical_size { + // XXX we should be using spawn_ondemand_logical_size_calculation here. + // Otherwise, if someone deletes the timeline / detaches the tenant while + // we're executing this function, we will outlive the timeline on-disk state. info.current_logical_size_non_incremental = - Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?); + Some(timeline.get_current_logical_size_non_incremental( + info.last_record_lsn, + CancellationToken::new(), + )?); } if include_non_incremental_physical_size { info.current_physical_size_non_incremental = diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0e334a63df..797ee9f436 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,7 +10,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; -use anyhow::{bail, ensure, Result}; +use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, Bytes}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -19,6 +19,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::Range; +use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -33,6 +34,14 @@ pub enum LsnForTimestamp { NoData(Lsn), } +#[derive(Debug, thiserror::Error)] +pub enum CalculateLogicalSizeError { + #[error("cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + /// /// This impl provides all the functionality to store PostgreSQL relations, SLRUs, /// and other special kinds of files, in a versioned key-value store. The @@ -376,14 +385,21 @@ impl Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. - pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + pub fn get_current_logical_size_non_incremental( + &self, + lsn: Lsn, + cancel: CancellationToken, + ) -> std::result::Result { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn)?; - let dbdir = DbDirectory::des(&buf)?; + let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self.list_rels(*spcnode, *dbnode, lsn)? 
{ + if cancel.is_cancelled() { + return Err(CalculateLogicalSizeError::Cancelled); + } let relsize_key = rel_size_to_key(rel); let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 24d9b2a10e..597461ce29 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,8 +3,11 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use anyhow::Context; +use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; + use super::Tenant; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -212,11 +215,30 @@ pub(super) async fn gather_inputs( let mut have_any_error = false; while let Some(res) = joinset.join_next().await { - // each of these come with Result, JoinError> + // each of these come with Result, JoinError> // because of spawn + spawn_blocking - let res = res.and_then(|inner| inner); match res { - Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => { + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures, nor should be"); + } + Err(join_error) => { + // cannot really do anything, as this panic is likely a bug + error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); + have_any_error = true; + } + Ok(Err(recv_result_error)) => { + // cannot really do anything, as this panic is likely a bug + error!("failed to receive logical size query result: {recv_result_error:#}"); + have_any_error = true; + } + Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => { + warn!( + timeline_id=%timeline.timeline_id, + "failed to calculate logical size at {lsn}: {error:#}" + ); + have_any_error = true; + } + Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => { debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); logical_size_cache.insert((timeline.timeline_id, lsn), size); @@ -228,21 +250,6 @@ pub(super) async fn gather_inputs( command: Command::Update(size), }); } - Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => { - warn!( - timeline_id=%timeline.timeline_id, - "failed to calculate logical size at {lsn}: {error:#}" - ); - have_any_error = true; - } - Err(join_error) if join_error.is_cancelled() => { - unreachable!("we are not cancelling any of the futures, nor should be"); - } - Err(join_error) => { - // cannot really do anything, as this panic is likely a bug - error!("logical size query panicked: {join_error:#}"); - have_any_error = true; - } } } @@ -351,7 +358,7 @@ enum LsnKind { struct TimelineAtLsnSizeResult( Arc, utils::lsn::Lsn, - anyhow::Result, + Result, ); #[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))] @@ -359,17 +366,15 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, -) -> Result { - let permit = tokio::sync::Semaphore::acquire_owned(limit) +) -> Result { + let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); - tokio::task::spawn_blocking(move || { - let _permit = permit; - let size_res = timeline.calculate_logical_size(lsn); - TimelineAtLsnSizeResult(timeline, lsn, size_res) - }) - .await + let size_res = timeline + .spawn_ondemand_logical_size_calculation(lsn) + .await?; + Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } #[test] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 
b7f12609e6..3373c52231 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -6,8 +6,9 @@ use fail::fail_point; use itertools::Itertools; use once_cell::sync::OnceCell; use pageserver_api::models::TimelineState; -use tokio::sync::watch; +use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio::task::spawn_blocking; +use tokio_util::sync::CancellationToken; use tracing::*; use std::cmp::{max, min, Ordering}; @@ -36,9 +37,9 @@ use crate::tenant::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; -use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; use crate::tenant_config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -176,7 +177,6 @@ pub struct Timeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, - initial_size_computation_state: Mutex, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -189,14 +189,6 @@ pub struct Timeline { state: watch::Sender, } -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -enum InitialLogicalSizeComputationState { - NotStarted, - Running, - FailedWillRetryNextTime, - Success, -} - /// Internal structure to hold all data needed for logical size calculation. /// Calculation consists of two parts: /// 1. Initial size calculation. That might take a long time, because it requires @@ -210,6 +202,8 @@ struct LogicalSize { /// /// NOTE: initial size is not a constant and will change between restarts. initial_logical_size: OnceCell, + /// Semaphore to track ongoing calculation of `initial_logical_size`. + initial_size_computation: Arc, /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. initial_part_end: Option, /// All other size changes after startup, combined together. @@ -260,6 +254,8 @@ impl LogicalSize { fn empty_initial() -> Self { Self { initial_logical_size: OnceCell::with_value(0), + // initial_logical_size already computed, so, don't admit any calculations + initial_size_computation: Arc::new(Semaphore::new(0)), initial_part_end: None, size_added_after_initial: AtomicI64::new(0), } @@ -268,6 +264,7 @@ impl LogicalSize { fn deferred_initial(compute_to: Lsn) -> Self { Self { initial_logical_size: OnceCell::new(), + initial_size_computation: Arc::new(Semaphore::new(1)), initial_part_end: Some(compute_to), size_added_after_initial: AtomicI64::new(0), } @@ -812,9 +809,6 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - initial_size_computation_state: Mutex::new( - InitialLogicalSizeComputationState::NotStarted, - ), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -1231,13 +1225,21 @@ impl Timeline { } fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { - use InitialLogicalSizeComputationState::*; - let mut guard = self.initial_size_computation_state.lock().unwrap(); - match *guard { - Running | Success => return, - NotStarted | FailedWillRetryNextTime => *guard = Running, - } - drop(guard); + let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) + .try_acquire_owned() + { + Ok(permit) => permit, + Err(TryAcquireError::NoPermits) => { + // computation already ongoing or finished with success + return; + } + Err(TryAcquireError::Closed) => unreachable!("we never call close"), + }; + debug_assert!(self + .current_logical_size + .initial_logical_size + .get() + .is_none()); // We need to start the computation task. let self_clone = Arc::clone(self); task_mgr::spawn( @@ -1247,79 +1249,131 @@ impl Timeline { Some(self.timeline_id), "initial size calculation", false, + // NB: don't log errors here, task_mgr will do that. async move { - let res = self_clone - .initial_logical_size_calculation_task(init_lsn) - .await; - // task_mgr will log the result - let new_state = match res { - Ok(_) => Success, - Err(_) => FailedWillRetryNextTime, + let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await + { + Ok(s) => s, + Err(CalculateLogicalSizeError::Cancelled) => { + // Don't make noise, this is a common task. + // In the unlikely case that there ihs another call to this function, we'll retry + // because initial_logical_size is still None. + info!("initial size calculation cancelled, likely timeline delete / tenant detach"); + return Ok(()); + } + x @ Err(_) => x.context("Failed to calculate logical size")?, }; - let mut state = self_clone.initial_size_computation_state.lock().unwrap(); - if *state != Running { - // Should be unreachable, but no reason to crash the pageserver. Don't touch anything. - error!("expecting initial size computation task to be in state {Running:?}, got {state:?}") - } else { - *state = new_state; - } - res - }, - ); - } - - #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] - async fn initial_logical_size_calculation_task( - self: &Arc, - init_lsn: Lsn, - ) -> anyhow::Result<()> { - let mut timeline_state_updates = self.subscribe_for_state_updates(); - let self_calculation = Arc::clone(self); - tokio::select! { - calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => { - let calculated_size = calculation_result - .context("Failed to spawn calculation result task")? - .context("Failed to calculate logical size")?; - match self.current_logical_size.initial_logical_size.set(calculated_size) { + match self_clone + .current_logical_size + .initial_logical_size + .set(calculated_size) + { Ok(()) => (), Err(existing_size) => { - // This shouldn't happen because we use self.initial_size_computation_running to ensure exlusivity here. + // This shouldn't happen because the semaphore is initialized with 1. // But if it happens, just complain & report success so there are no further retries. 
error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing") } } + // now that `initial_logical_size.is_some()`, reduce permit count to 0 + // so that we prevent future callers from spawning this task + permit.forget(); Ok(()) }, - _ = task_mgr::shutdown_watcher() => { - anyhow::bail!("aborted because task_mgr shutdown requested"); - } - new_event = async { - loop { - match timeline_state_updates.changed().await { - Ok(()) => { - let new_state = *timeline_state_updates.borrow(); - match new_state { - // we're running this job for active timelines only - TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state), + ); + } + + pub fn spawn_ondemand_logical_size_calculation( + self: &Arc, + lsn: Lsn, + ) -> oneshot::Receiver> { + let (sender, receiver) = oneshot::channel(); + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "ondemand logical size calculation", + false, + async move { + let res = self_clone.logical_size_calculation_task(lsn).await; + let _ = sender.send(res).ok(); + Ok(()) // Receiver is responsible for handling errors + }, + ); + receiver + } + + #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] + async fn logical_size_calculation_task( + self: &Arc, + init_lsn: Lsn, + ) -> Result { + let mut timeline_state_updates = self.subscribe_for_state_updates(); + let self_calculation = Arc::clone(self); + let cancel = CancellationToken::new(); + + let calculation = async { + let cancel = cancel.child_token(); + spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn, cancel)) + .await + .context("Failed to spawn calculation result task")? + }; + let timeline_state_cancellation = async { + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = *timeline_state_updates.borrow(); + match new_state { + // we're running this job for active timelines only + TimelineState::Active => continue, + TimelineState::Broken + | TimelineState::Stopping + | TimelineState::Suspended => { + break format!("aborted because timeline became inactive (new state: {new_state:?})") } } - Err(_sender_dropped_error) => return None, + } + Err(_sender_dropped_error) => { + // can't happen, the sender is not dropped as long as the Timeline exists + break "aborted because state watch was dropped".to_string(); } } - } => { - match new_event { - Some(new_state) => anyhow::bail!("aborted because timeline became inactive (new state: {new_state:?})"), - None => anyhow::bail!("aborted because state watch was dropped"), // can't happen, the sender is not dropped as long as the Timeline exists + } + }; + + let taskmgr_shutdown_cancellation = async { + task_mgr::shutdown_watcher().await; + "aborted because task_mgr shutdown requested".to_string() + }; + + tokio::pin!(calculation); + loop { + tokio::select! { + res = &mut calculation => { return res } + reason = timeline_state_cancellation => { + debug!(reason = reason, "cancelling calculation"); + cancel.cancel(); + return calculation.await; } - }, + reason = taskmgr_shutdown_cancellation => { + debug!(reason = reason, "cancelling calculation"); + cancel.cancel(); + return calculation.await; + } + } } } /// Calculate the logical size of the database at the latest LSN. 
/// /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. - pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result { + pub fn calculate_logical_size( + &self, + up_to_lsn: Lsn, + cancel: CancellationToken, + ) -> Result { info!( "Calculating logical size for timeline {} at {}", self.timeline_id, up_to_lsn @@ -1360,7 +1414,7 @@ impl Timeline { } else { self.metrics.logical_size_histo.start_timer() }; - let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?; + let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn, cancel)?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); Ok(logical_size) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 38660cefac..523c946a68 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -265,10 +265,6 @@ def test_timeline_initial_logical_size_calculation_cancellation( log.info( f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" ) - if deletion_method == "timeline_delete": - env.pageserver.allowed_errors.append( - f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested" - ) delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) def delete_timeline_thread_fn(): From 49a211c98a357543ab78a320466787487f9d7ed5 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 19 Dec 2022 15:38:41 +0200 Subject: [PATCH 104/167] Add neon_local test --- control_plane/src/bin/neon_local.rs | 1 + test_runner/fixtures/neon_fixtures.py | 6 ++++++ test_runner/regress/test_neon_local_cli.py | 10 ++++++++++ 3 files changed, 17 insertions(+) create mode 100644 test_runner/regress/test_neon_local_cli.py diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f0c3b983f0..61b9445c6d 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -900,6 +900,7 @@ fn cli() -> Command { let stop_mode_arg = Arg::new("stop-mode") .short('m') .value_parser(["fast", "immediate"]) + .default_value("fast") .help("If 'immediate', don't flush repository data at shutdown") .required(false) .value_name("stop-mode"); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3a3ee94425..b3e4809f24 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1742,6 +1742,12 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(args, check_return_code=check_return_code) + def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": + return self.raw_cli(["start"], check_return_code=check_return_code) + + def stop(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": + return self.raw_cli(["stop"], check_return_code=check_return_code) + class WalCraft(AbstractNeonCli): """ diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py new file mode 100644 index 0000000000..6c7cdb6f7f --- /dev/null +++ b/test_runner/regress/test_neon_local_cli.py @@ -0,0 +1,10 @@ +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Test that neon cli is able to start and stop all processes with the user defaults. 
+# def test_neon_cli_basics(neon_simple_env: NeonEnv): +def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init() + + env.neon_cli.start() + env.neon_cli.stop() From 9ddd1d75225afed164f79809e1575121409ed69d Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 19 Dec 2022 16:41:26 +0200 Subject: [PATCH 105/167] Stop all storage nodes on startup failure --- control_plane/src/bin/neon_local.rs | 48 ++++++++++++++++------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 61b9445c6d..53fd3100c7 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -747,7 +747,7 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result Resul } fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { - broker::start_broker_process(env)?; - let pageserver = PageServerNode::from_env(env); - // Postgres nodes are not started automatically + broker::start_broker_process(env)?; + + let pageserver = PageServerNode::from_env(env); if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { - eprintln!("pageserver start failed: {e}"); - try_stop_storage_broker_process(env); + eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e); + try_stop_all(env, true); exit(1); } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id); - try_stop_storage_broker_process(env); + eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); + try_stop_all(env, false); exit(1); } } @@ -832,35 +832,41 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + try_stop_all(env, immediate); + + Ok(()) +} + +fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { let pageserver = PageServerNode::from_env(env); // Stop all compute nodes - let cplane = ComputeControlPlane::load(env.clone())?; - for (_k, node) in cplane.nodes { - if let Err(e) = node.stop(false) { - eprintln!("postgres stop failed: {}", e); + match ComputeControlPlane::load(env.clone()) { + Ok(cplane) => { + for (_k, node) in cplane.nodes { + if let Err(e) = node.stop(false) { + eprintln!("postgres stop failed: {e:#}"); + } + } + } + Err(e) => { + eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}") } } if let Err(e) = pageserver.stop(immediate) { - eprintln!("pageserver stop failed: {}", e); + eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e); } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); + eprintln!("safekeeper {} stop failed: {:#}", safekeeper.id, e); } } - try_stop_storage_broker_process(env); - - Ok(()) -} - -fn try_stop_storage_broker_process(env: &local_env::LocalEnv) { if let Err(e) = broker::stop_broker_process(env) { - eprintln!("neon broker stop failed: {e}"); + eprintln!("neon broker stop failed: {e:#}"); } } From 3735aece562902ec3ff045a0f4e4f70662090bff Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 19 Dec 2022 16:42:01 +0200 Subject: [PATCH 106/167] Safekeeper: Always use workdir as a full path --- safekeeper/src/bin/safekeeper.rs | 14 
++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 275253d1d4..5ad88276e8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -129,17 +129,22 @@ fn main() -> anyhow::Result<()> { logging::init(LogFormat::from_config(&args.log_format)?)?; info!("version: {GIT_VERSION}"); + let args_workdir = &args.datadir; + let workdir = args_workdir.canonicalize().with_context(|| { + format!("Failed to get the absolute path for input workdir {args_workdir:?}") + })?; + // Change into the data directory. - std::env::set_current_dir(&args.datadir)?; + std::env::set_current_dir(&workdir)?; // Set or read our ID. - let id = set_id(&args.datadir, args.id.map(NodeId))?; + let id = set_id(&workdir, args.id.map(NodeId))?; if args.init { return Ok(()); } let conf = SafeKeeperConf { - workdir: args.datadir, + workdir, my_id: id, listen_pg_addr: args.listen_pg, listen_http_addr: args.listen_http, @@ -308,7 +313,8 @@ fn set_id(workdir: &Path, given_id: Option) -> Result { } else { bail!("safekeeper id is not specified"); }; - let mut f = File::create(&id_file_path)?; + let mut f = File::create(&id_file_path) + .with_context(|| format!("Failed to create id file at {id_file_path:?}"))?; f.write_all(my_id.to_string().as_bytes())?; f.sync_all()?; info!("initialized safekeeper id {}", my_id); From 39f58038d1a03fd309ed0494b9cbcc1bef99bbef Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Dec 2022 23:58:24 +0200 Subject: [PATCH 107/167] Don't upload index file in compaction, if there was nothing to do. (#3149) This splits the storage_sync2::schedule_index_file into two (public) functions: 1. `schedule_index_upload_for_metadata_update`, for when the metadata (e.g. disk_consistent_lsn or last_gc_cutoff) has changed, and 2. `schedule_index_upload_for_file_changes`, for when layer file uploads or deletions have been scheduled. We now keep track of whether there have been any uploads or deletions since the last index-file upload, and skip the upload in `schedule_index_upload_for_file_changes` if there haven't been any changes. That allows us to call the function liberally in timeline.rs, whenever layer file uploads or deletions might've been scheduled, without starting a lot of unnecessary index file uploads. GC was covered earlier by commit c262390214, but that missed that we have the same problem with compaction. --- pageserver/src/storage_sync2.rs | 103 ++++++++++++++++------ pageserver/src/storage_sync2/index.rs | 14 ++- pageserver/src/tenant/timeline.rs | 29 ++++-- test_runner/regress/test_gc_aggressive.py | 5 ++ 4 files changed, 114 insertions(+), 37 deletions(-) diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 89bbc34227..14763985ab 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -32,7 +32,8 @@ //! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]: //! //! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file. -//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file. +//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file. +//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads //! 
- [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files. //! //! Internally, these functions create [`UploadOp`]s and put them in a queue. @@ -290,6 +291,10 @@ struct UploadQueueInitialized { /// in-progress and queued operations latest_files: HashMap, + /// How many file uploads or deletions been scheduled, since the + /// last (scheduling of) metadata index upload? + latest_files_changes_since_metadata_upload_scheduled: u64, + /// Metadata stored in the remote storage, taking into account all /// in-progress and queued operations. /// DANGER: do not return to outside world, e.g., safekeepers. @@ -339,6 +344,7 @@ impl UploadQueue { let state = UploadQueueInitialized { // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. latest_files: HashMap::new(), + latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent // safekeepers from garbage-collecting anything. @@ -385,6 +391,7 @@ impl UploadQueue { let state = UploadQueueInitialized { latest_files: files, + latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part_metadata.clone(), last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(), // what follows are boring default initializations @@ -558,7 +565,9 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { - upgraded.merge(&new_metadata); + if upgraded.merge(&new_metadata) { + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; + } // If we don't do an index file upload inbetween here and restart, // the value will go back down after pageserver restart, since we will // have lost this data point. @@ -583,14 +592,20 @@ impl RemoteTimelineClient { // /// - /// Launch an index-file upload operation in the background. + /// Launch an index-file upload operation in the background, with + /// updated metadata. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previosuly scheduled layer file /// upload operations have completed successfully. This is to /// ensure that when the index file claims that layers X, Y and Z - /// exist in remote storage, they really do. - pub fn schedule_index_upload( + /// exist in remote storage, they really do. To wait for the upload + /// to complete, use `wait_completion`. + /// + /// If there were any changes to the list of files, i.e. if any + /// layer file uploads were scheduled, since the last index file + /// upload, those will be included too. + pub fn schedule_index_upload_for_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -601,26 +616,60 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.latest_metadata = metadata.clone(); + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + self.schedule_index_upload(upload_queue, metadata_bytes); + + Ok(()) + } + + /// + /// Launch an index-file upload operation in the background, if necessary. + /// + /// Use this function to schedule the update of the index file after + /// scheduling file uploads or deletions. If no file uploads or deletions + /// have been scheduled since the last index file upload, this does + /// nothing. 
+ /// + /// Like schedule_index_upload_for_metadata_update(), this merely adds + /// the upload to the upload queue and returns quickly. + pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + self.schedule_index_upload(upload_queue, metadata_bytes); + } + + Ok(()) + } + + /// Launch an index-file upload operation in the background (internal function) + fn schedule_index_upload( + self: &Arc, + upload_queue: &mut UploadQueueInitialized, + metadata_bytes: Vec, + ) { + info!( + "scheduling metadata upload with {} files ({} changed)", + upload_queue.latest_files.len(), + upload_queue.latest_files_changes_since_metadata_upload_scheduled, + ); + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); let index_part = IndexPart::new( upload_queue.latest_files.clone(), disk_consistent_lsn, - upload_queue.latest_metadata.to_bytes()?, + metadata_bytes, ); let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); self.update_upload_queue_unfinished_metric(1, &op); upload_queue.queued_operations.push_back(op); - - info!( - "scheduled metadata upload with {} files", - upload_queue.latest_files.len() - ); + upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); - - Ok(()) } /// @@ -644,6 +693,7 @@ impl RemoteTimelineClient { upload_queue .latest_files .insert(layer_file_name.clone(), layer_metadata.clone()); + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone()); self.update_upload_queue_unfinished_metric(1, &op); @@ -662,8 +712,11 @@ impl RemoteTimelineClient { /// /// Launch a delete operation in the background. /// - /// The deletion won't actually be performed, until all preceding - /// upload operations have completed succesfully. + /// Note: This schedules an index file upload before the deletions. The + /// deletion won't actually be performed, until any previously scheduled + /// upload operations, and the index file upload, have completed + /// succesfully. + /// pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], @@ -674,7 +727,6 @@ impl RemoteTimelineClient { // Deleting layers doesn't affect the values stored in TimelineMetadata, // so we don't need update it. Just serialize it. let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); // Update the remote index file, removing the to-be-deleted files from the index, // before deleting the actual files. 
@@ -686,16 +738,12 @@ impl RemoteTimelineClient { let no_bail_here = || { for name in names { upload_queue.latest_files.remove(name); + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; } - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata_bytes, - ); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - self.update_upload_queue_unfinished_metric(1, &op); - upload_queue.queued_operations.push_back(op); + if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { + self.schedule_index_upload(upload_queue, metadata_bytes); + } // schedule the actual deletions for name in names { @@ -1244,15 +1292,19 @@ mod tests { assert!(upload_queue.queued_operations.is_empty()); assert!(upload_queue.inprogress_tasks.len() == 2); assert!(upload_queue.num_inprogress_layer_uploads == 2); + + // also check that `latest_file_changes` was updated + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); } // Schedule upload of index. Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); - client.schedule_index_upload(&metadata)?; + client.schedule_index_upload_for_metadata_update(&metadata)?; { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.len() == 1); + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0); } // Wait for the uploads to finish @@ -1288,6 +1340,7 @@ mod tests { assert!(upload_queue.inprogress_tasks.len() == 1); assert!(upload_queue.num_inprogress_layer_uploads == 1); assert!(upload_queue.num_inprogress_deletions == 0); + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0); } assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir); diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs index ed4ed10189..bb58a34969 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/storage_sync2/index.rs @@ -48,9 +48,17 @@ impl LayerFileMetadata { /// Metadata has holes due to version upgrades. This method is called to upgrade self with the /// other value. /// - /// This is called on the possibly outdated version. - pub fn merge(&mut self, other: &Self) { - self.file_size = other.file_size.or(self.file_size); + /// This is called on the possibly outdated version. Returns true if any changes + /// were made. + pub fn merge(&mut self, other: &Self) -> bool { + let mut changed = false; + + if self.file_size != other.file_size { + self.file_size = other.file_size.or(self.file_size); + changed = true; + } + + changed } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3373c52231..b1f580c32f 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -589,6 +589,18 @@ impl Timeline { let timer = self.metrics.compact_time_histo.start_timer(); self.compact_level0(target_file_size).await?; timer.stop_and_record(); + + // If `create_image_layers' or `compact_level0` scheduled any + // uploads or deletions, but didn't update the index file yet, + // do it now. + // + // This isn't necessary for correctness, the remote state is + // consistent without the uploads and deletions, and we would + // update the index file on next flush iteration too. But it + // could take a while until that happens. 
+ if let Some(remote_client) = &self.remote_client { + remote_client.schedule_index_upload_for_file_changes()?; + } } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -1215,9 +1227,7 @@ impl Timeline { remote_client .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; } - if !local_only_layers.is_empty() { - remote_client.schedule_index_upload(up_to_date_metadata)?; - } + remote_client.schedule_index_upload_for_file_changes()?; info!("Done"); @@ -1923,13 +1933,9 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { - remote_client - .schedule_layer_file_upload(&path, &layer_metadata) - .context("schedule_layer_file_upload")?; + remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; } - remote_client - .schedule_index_upload(&metadata) - .context("schedule_layer_file_upload")?; + remote_client.schedule_index_upload_for_metadata_update(&metadata)?; } Ok(()) @@ -2398,6 +2404,11 @@ impl Timeline { deltas_to_compact, } = self.compact_level0_phase1(target_file_size).await?; + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + // Before deleting any layers, we need to wait for their upload ops to finish. // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index b9d012fa36..5f052bf81a 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -165,6 +165,11 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')") pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32) num_index_uploads = get_num_remote_ops("index", "upload") + + # Also make sure that a no-op compaction doesn't upload the index + # file unnecessarily. 
+ pageserver_http.timeline_compact(tenant_id, timeline_id) + log.info(f"{num_index_uploads} index uploads after GC iteration {i}") after = num_index_uploads From f9f57e211a69e8f67058dcb6841b59c7a5743924 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Dec 2022 01:55:59 +0200 Subject: [PATCH 108/167] Use local brokers --- test_runner/fixtures/neon_fixtures.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b3e4809f24..59dd21f84c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -286,7 +286,7 @@ def port_distributor(worker_base_port: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def default_broker( request: FixtureRequest, port_distributor: PortDistributor, @@ -296,9 +296,8 @@ def default_broker( # multiple pytest sessions could get launched in parallel, get them different ports/datadirs client_port = port_distributor.get_port() broker_logfile = ( - get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log" + get_test_repo_dir(request, top_output_dir) / f"storage_broker.log" ) - broker_logfile.parents[0].mkdir(exist_ok=True, parents=True) broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) yield broker @@ -1012,7 +1011,7 @@ def _shared_simple_env( if os.environ.get("TEST_SHARED_FIXTURES") is None: # Create the environment in the per-test output directory - repo_dir = get_test_output_dir(request, top_output_dir) / "repo" + repo_dir = get_test_repo_dir(request, top_output_dir) else: # We're running shared fixtures. Share a single directory. repo_dir = top_output_dir / "shared_repo" @@ -2791,6 +2790,9 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir +def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return get_test_output_dir(request, top_output_dir) / "repo" + def pytest_addoption(parser: Parser): parser.addoption( "--preserve-database-files", From 56d8c25dc86145a3d2c72028d83b037eb673d9c0 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Dec 2022 01:57:36 +0200 Subject: [PATCH 109/167] Revert "Use local brokers" This reverts commit f9f57e211a69e8f67058dcb6841b59c7a5743924. 
--- test_runner/fixtures/neon_fixtures.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 59dd21f84c..b3e4809f24 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -286,7 +286,7 @@ def port_distributor(worker_base_port: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) -@pytest.fixture(scope="function") +@pytest.fixture(scope="session") def default_broker( request: FixtureRequest, port_distributor: PortDistributor, @@ -296,8 +296,9 @@ def default_broker( # multiple pytest sessions could get launched in parallel, get them different ports/datadirs client_port = port_distributor.get_port() broker_logfile = ( - get_test_repo_dir(request, top_output_dir) / f"storage_broker.log" + get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log" ) + broker_logfile.parents[0].mkdir(exist_ok=True, parents=True) broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) yield broker @@ -1011,7 +1012,7 @@ def _shared_simple_env( if os.environ.get("TEST_SHARED_FIXTURES") is None: # Create the environment in the per-test output directory - repo_dir = get_test_repo_dir(request, top_output_dir) + repo_dir = get_test_output_dir(request, top_output_dir) / "repo" else: # We're running shared fixtures. Share a single directory. repo_dir = top_output_dir / "shared_repo" @@ -2790,9 +2791,6 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir -def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - return get_test_output_dir(request, top_output_dir) / "repo" - def pytest_addoption(parser: Parser): parser.addoption( "--preserve-database-files", From 6ac9ecb074aceb7126666f29e45010fd7efc3dad Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Dec 2022 20:50:02 +0200 Subject: [PATCH 110/167] Remove a few unnecessary checkpoint calls from unit tests. The `make_some_layers' function performs a checkpoint already. --- pageserver/src/tenant.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0e59b43dda..64e214c5a2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2993,7 +2993,6 @@ mod tests { .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; } let tenant = harness.load().await; @@ -3016,7 +3015,6 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -3025,7 +3023,6 @@ mod tests { .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; } // check that both of them are initially unloaded From 7b0d28bbdce81cf470800aa27ef239b47116380c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 19 Dec 2022 21:09:41 +0200 Subject: [PATCH 111/167] Update outdated comment on Tenant::gc_iteration. Commit 6dec85b19d remove the `checkpoint_before_gc` argument, but failed to update the comment. Remove its description, and while we're at it, try to explain better how the `horizon` and `pitr` arguments are used. 
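To make "conservatively, whichever requires more history" concrete, a hypothetical sketch (not this repository's code) with both cutoffs reduced to plain integer LSNs:

// Hypothetical helper: `horizon` yields one candidate cutoff, the PITR window
// another (already resolved to an LSN); the effective cutoff is whichever
// retains more history, i.e. the smaller LSN.
fn effective_gc_cutoff(latest_lsn: u64, horizon: u64, pitr_cutoff_lsn: u64) -> u64 {
    let horizon_cutoff = latest_lsn.saturating_sub(horizon);
    horizon_cutoff.min(pitr_cutoff_lsn)
}

fn main() {
    // Horizon alone would allow cutting at 800, but PITR needs history back to 700.
    assert_eq!(effective_gc_cutoff(1000, 200, 700), 700);
    // Here PITR is satisfied by a newer LSN, so the horizon is the binding constraint.
    assert_eq!(effective_gc_cutoff(1000, 200, 950), 800);
    println!("ok");
}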
--- pageserver/src/tenant.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 64e214c5a2..edd7a3cb07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1154,11 +1154,15 @@ impl Tenant { /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. /// - /// 'target_timeline_id' specifies the timeline to GC, or None for all. - /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC - /// to make tests more deterministic. - /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? + /// `target_timeline_id` specifies the timeline to GC, or None for all. + /// + /// The `horizon` an `pitr` parameters determine how much WAL history needs to be retained. + /// Also known as the retention period, or the GC cutoff point. `horizon` specifies + /// the amount of history, as LSN difference from current latest LSN on each timeline. + /// `pitr` specifies the same as a time difference from the current time. The effective + /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever + /// requires more history to be retained. + // pub async fn gc_iteration( &self, target_timeline_id: Option, From cd7fdf2587625ba963ef36b0e1d878e505732988 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 20 Dec 2022 12:03:42 +0100 Subject: [PATCH 112/167] Remove neon-stress configs (#3121) --- .github/ansible/neon-stress.hosts.yaml | 32 ---------- .../neon-stress.neon-storage-broker.yaml | 56 ----------------- .../helm-values/neon-stress.proxy-scram.yaml | 52 ---------------- .github/helm-values/neon-stress.proxy.yaml | 61 ------------------- 4 files changed, 201 deletions(-) delete mode 100644 .github/ansible/neon-stress.hosts.yaml delete mode 100644 .github/helm-values/neon-stress.neon-storage-broker.yaml delete mode 100644 .github/helm-values/neon-stress.proxy-scram.yaml delete mode 100644 .github/helm-values/neon-stress.proxy.yaml diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml deleted file mode 100644 index 5d5df5a6d5..0000000000 --- a/.github/ansible/neon-stress.hosts.yaml +++ /dev/null @@ -1,32 +0,0 @@ -storage: - vars: - bucket_name: neon-storage-ireland - bucket_region: eu-west-1 - console_mgmt_base_url: http://neon-stress-console.local - broker_endpoint: http://storage-broker.neon-stress.local:50051 - safekeeper_enable_s3_offload: 'false' - pageserver_config_stub: - pg_distrib_dir: /usr/local - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: neon-stress/wal - hostname_suffix: ".local" - remote_user: admin - sentry_environment: development - children: - pageservers: - hosts: - neon-stress-ps-1: - console_region_id: aws-eu-west-1 - neon-stress-ps-2: - console_region_id: aws-eu-west-1 - safekeepers: - hosts: - neon-stress-sk-1: - console_region_id: aws-eu-west-1 - neon-stress-sk-2: - console_region_id: aws-eu-west-1 - neon-stress-sk-3: - console_region_id: aws-eu-west-1 diff --git a/.github/helm-values/neon-stress.neon-storage-broker.yaml b/.github/helm-values/neon-stress.neon-storage-broker.yaml deleted file mode 100644 index e11e5d4214..0000000000 --- 
a/.github/helm-values/neon-stress.neon-storage-broker.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Helm chart values for neon-storage-broker -podLabels: - neon_env: neon-stress - neon_service: storage-broker - -# Use L4 LB -service: - # service.annotations -- Annotations to add to the service - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet - # assign service to this name at external-dns - external-dns.alpha.kubernetes.io/hostname: storage-broker.neon-stress.local - # service.type -- Service type - type: LoadBalancer - # service.port -- broker listen port - port: 50051 - -ingress: - enabled: false - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-storage-broker.fullname\" . }}" - labels: - helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} - app.kubernetes.io/name: neon-storage-broker - app.kubernetes.io/instance: neon-storage-broker - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-storage-broker" - endpoints: - - port: broker - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" - -settings: - sentryEnvironment: "development" diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml deleted file mode 100644 index ed580349fc..0000000000 --- a/.github/helm-values/neon-stress.proxy-scram.yaml +++ /dev/null @@ -1,52 +0,0 @@ -fullnameOverride: "neon-stress-proxy-scram" - -settings: - authBackend: "console" - authEndpoint: "http://neon-stress-console.local/management/api/v2" - domain: "*.stress.neon.tech" - sentryEnvironment: "development" - -podLabels: - zenith_service: proxy-scram - zenith_env: staging - zenith_region: eu-west-1 - zenith_region_slug: ireland - -exposedService: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech' - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-proxy.fullname\" . }}" - labels: - helm.sh/chart: neon-proxy-{{ .Chart.Version }} - app.kubernetes.io/name: neon-proxy - app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-proxy" - endpoints: - - port: http - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml deleted file mode 100644 index 94270ced09..0000000000 --- a/.github/helm-values/neon-stress.proxy.yaml +++ /dev/null @@ -1,61 +0,0 @@ -fullnameOverride: "neon-stress-proxy" - -settings: - authBackend: "link" - authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/" - uri: "https://console.dev.neon.tech/psql_session/" - sentryEnvironment: "development" - -# -- Additional labels for zenith-proxy pods -podLabels: - zenith_service: proxy - zenith_env: staging - zenith_region: eu-west-1 - zenith_region_slug: ireland - -service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal - external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local - type: LoadBalancer - -exposedService: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-proxy.fullname\" . }}" - labels: - helm.sh/chart: neon-proxy-{{ .Chart.Version }} - app.kubernetes.io/name: neon-proxy - app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-proxy" - endpoints: - - port: http - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" From 2c11f1fa95334b582372edb5a89aaeb1b779e6d7 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Dec 2022 13:06:21 +0200 Subject: [PATCH 113/167] Use separate broker per Python test (#3158) And add its logs to Allure reports per test --- test_runner/fixtures/neon_fixtures.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b3e4809f24..2eabc25ef6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -286,24 +286,19 @@ def port_distributor(worker_base_port: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def default_broker( - request: FixtureRequest, port_distributor: PortDistributor, - top_output_dir: Path, + test_output_dir: Path, neon_binpath: Path, ) -> Iterator[NeonBroker]: # multiple pytest sessions could get launched in parallel, get them different ports/datadirs client_port = port_distributor.get_port() - broker_logfile = ( - get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log" - ) - broker_logfile.parents[0].mkdir(exist_ok=True, parents=True) + broker_logfile = test_output_dir / "repo" / "storage_broker.log" broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) yield broker broker.stop() - allure_attach_from_dir(Path(broker_logfile)) @pytest.fixture(scope="session") @@ -1012,7 +1007,7 @@ def _shared_simple_env( if os.environ.get("TEST_SHARED_FIXTURES") is None: # Create the environment in the per-test output directory - repo_dir = get_test_output_dir(request, top_output_dir) / "repo" + repo_dir = get_test_repo_dir(request, top_output_dir) else: # We're running shared fixtures. Share a single directory. repo_dir = top_output_dir / "shared_repo" @@ -2791,6 +2786,10 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir +def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return get_test_output_dir(request, top_output_dir) / "repo" + + def pytest_addoption(parser: Parser): parser.addoption( "--preserve-database-files", From eefb1d46f4837476aad2de872a2fcf873189a517 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Dec 2022 01:19:56 +0200 Subject: [PATCH 114/167] Replace Timeline::checkpoint with Timeline::freeze_and_flush The new Timeline::freeze_and_flush function is equivalent to calling Timeline::checkpoint(CheckpointConfig::Flush). There were only one non-test caller that used CheckpointConfig::Forced, so replace that with a call to the new Timeline::freeze_and_flush, followed by an explicit call to Timeline::compact. That only caller was to handle the mgmt API's 'checkpoint' endpoint. Perhaps we should split that into separate 'flush' and 'compact' endpoints too, but I didn't go that far yet. 
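A rough sketch of that mapping, using a dummy Timeline type with simplified signatures (not the actual pageserver API; assumes a tokio runtime):

// Dummy stand-in type; in the real code these methods flush the frozen
// in-memory layer to disk and reorganize on-disk layers, respectively.
struct Timeline;

impl Timeline {
    async fn freeze_and_flush(&self) -> Result<(), String> {
        Ok(())
    }
    async fn compact(&self) -> Result<(), String> {
        Ok(())
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let tline = Timeline;
    // What checkpoint(CheckpointConfig::Flush) used to do:
    tline.freeze_and_flush().await?;
    // What checkpoint(CheckpointConfig::Forced) used to do, now written out
    // explicitly by the mgmt API 'checkpoint' handler:
    tline.freeze_and_flush().await?;
    tline.compact().await?;
    Ok(())
}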
--- pageserver/src/http/routes.rs | 8 ++-- pageserver/src/lib.rs | 9 ---- pageserver/src/page_service.rs | 3 +- pageserver/src/tenant.rs | 68 +++++++++++++++---------------- pageserver/src/tenant/timeline.rs | 19 ++------- pageserver/src/tenant_mgr.rs | 2 +- 6 files changed, 44 insertions(+), 65 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 68a26b8098..937a6144b6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -31,8 +31,6 @@ use utils::{ // Imports only used for testing APIs #[cfg(feature = "testing")] use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; -#[cfg(feature = "testing")] -use crate::CheckpointConfig; struct State { conf: &'static PageServerConf, @@ -777,7 +775,11 @@ async fn timeline_checkpoint_handler(request: Request) -> Result>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding - // `timelines` mutex during all GC iteration (especially with enforced checkpoint) + // `timelines` mutex during all GC iteration // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... @@ -249,7 +249,7 @@ impl UninitializedTimeline<'_> { .context("Failed to import basebackup") })?; - // Flush loop needs to be spawned in order for checkpoint to be able to flush. + // Flush loop needs to be spawned in order to be able to flush. // We want to run proper checkpoint before we mark timeline as available to outside world // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock raw_timeline.maybe_spawn_flush_loop(); @@ -259,9 +259,9 @@ impl UninitializedTimeline<'_> { }); raw_timeline - .checkpoint(CheckpointConfig::Flush) + .freeze_and_flush() .await - .context("Failed to checkpoint after basebackup import")?; + .context("Failed to flush after basebackup import")?; let timeline = self.initialize()?; @@ -371,7 +371,7 @@ impl Drop for TimelineUninitMark { // We should not blindly overwrite local metadata with remote one. // For example, consider the following case: -// Checkpoint comes, we update local metadata and start upload task but after that +// Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that // pageserver crashes. During startup we'll load new metadata, and then reset it // to the state of remote one. But current layermap will have layers from the old // metadata which is inconsistent. @@ -1225,24 +1225,21 @@ impl Tenant { /// /// Used at graceful shutdown. /// - pub async fn checkpoint(&self) -> anyhow::Result<()> { + pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the - // checkpoints. We don't want to block everything else while the - // checkpoint runs. - let timelines_to_checkpoint = { + // flushing. We don't want to block everything else while the + // flushing is performed. 
+ let timelines_to_flush = { let timelines = self.timelines.lock().unwrap(); timelines .iter() - .map(|(id, timeline)| (*id, Arc::clone(timeline))) + .map(|(_id, timeline)| Arc::clone(timeline)) .collect::>() }; - for (id, timeline) in &timelines_to_checkpoint { - timeline - .checkpoint(CheckpointConfig::Flush) - .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id)) - .await?; + for timeline in &timelines_to_flush { + timeline.freeze_and_flush().await?; } Ok(()) @@ -2095,8 +2092,13 @@ impl Tenant { }); unfinished_timeline - .checkpoint(CheckpointConfig::Flush).await - .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?; + .freeze_and_flush() + .await + .with_context(|| { + format!( + "Failed to flush after pgdatadir import for timeline {tenant_id}/{timeline_id}" + ) + })?; let timeline = { let mut timelines = self.timelines.lock().unwrap(); @@ -2831,7 +2833,7 @@ mod tests { writer.finish_write(lsn); lsn += 0x10; } - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; { let writer = tline.writer(); writer.put( @@ -2848,7 +2850,7 @@ mod tests { )?; writer.finish_write(lsn); } - tline.checkpoint(CheckpointConfig::Forced).await + tline.freeze_and_flush().await } #[tokio::test] @@ -2863,7 +2865,7 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - // FIXME: this doesn't actually remove any layer currently, given how the checkpointing + // FIXME: this doesn't actually remove any layer currently, given how the flushing // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. tenant @@ -3098,7 +3100,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3106,7 +3108,7 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3114,7 +3116,7 @@ mod tests { writer.finish_write(Lsn(0x30)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3122,7 +3124,7 @@ mod tests { writer.finish_write(Lsn(0x40)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); @@ -3135,8 +3137,8 @@ mod tests { } // - // Insert 1000 key-value pairs with increasing keys, checkpoint, - // repeat 50 times. + // Insert 1000 key-value pairs with increasing keys, flush, compact, GC. + // Repeat 50 times. 
// #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { @@ -3172,7 +3174,7 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } @@ -3240,11 +3242,10 @@ mod tests { ); } - // Perform a cycle of checkpoint, compaction, and GC - println!("checkpointing {}", lsn); + // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } @@ -3323,11 +3324,10 @@ mod tests { ); } - // Perform a cycle of checkpoint, compaction, and GC - println!("checkpointing {}", lsn); + // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b1f580c32f..0697ec4bd6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -57,7 +57,6 @@ use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; -use crate::CheckpointConfig; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; @@ -499,22 +498,10 @@ impl Timeline { } /// Flush to disk all data that was written with the put_* functions - /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't - /// know anything about them here in the repository. 
#[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] - pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { - match cconf { - CheckpointConfig::Flush => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers_and_wait().await - } - CheckpointConfig::Forced => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers_and_wait().await?; - self.compact().await - } - } + pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { + self.freeze_inmem_layer(false); + self.flush_frozen_layers_and_wait().await } pub async fn compact(&self) -> anyhow::Result<()> { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 615dcce4a1..85be420cb8 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -196,7 +196,7 @@ pub async fn shutdown_all_tenants() { let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); - if let Err(err) = tenant.checkpoint().await { + if let Err(err) = tenant.freeze_and_flush().await { error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); } } From 4cda9919bf30baed0255eee2e61dd1797ffb2cb1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Dec 2022 13:34:18 +0200 Subject: [PATCH 115/167] Use Self to emphasize this is a constructor --- libs/remote_storage/src/lib.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 28858fcbab..568cb7224f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -164,18 +164,16 @@ impl Deref for GenericRemoteStorage { } impl GenericRemoteStorage { - pub fn from_config( - storage_config: &RemoteStorageConfig, - ) -> anyhow::Result { + pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { Ok(match &storage_config.storage { RemoteStorageKind::LocalFs(root) => { info!("Using fs root '{}' as a remote storage", root.display()); - GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?) + Self::LocalFs(LocalFs::new(root.clone())?) } RemoteStorageKind::AwsS3(s3_config) => { info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) } }) } From 8e2edfcf39c21e41cd3c2e74524ec7a777714555 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Dec 2022 13:34:21 +0200 Subject: [PATCH 116/167] Retry remote downloads. Remote operations fail sometimes due to network failures or other external reasons. Add retry logic to all the remote downloads, so that a transient failure at pageserver startup or tenant attach doesn't cause the whole tenant to be marked as Broken. Like in the uploads retry logic, we print the failure to the log as a WARNing after three retries, but keep retrying. We will retry up to 10 times now, before returning the error to the caller. To test the retries, I created a new RemoteStorage wrapper that simulates failures, by returning an error for the first N times that a remote operation is performed. It can be enabled by setting a new "test_remote_failures" option in the pageserver config file. 
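For illustration, a minimal way to switch the failure simulation on is a single line in
the pageserver config file. Only the option name and the value 1 are taken from this
patch and its tests; treat the snippet as a sketch, and note that the option only works
on a pageserver built with the 'testing' feature:

```toml
# Make every remote upload/download fail once before succeeding,
# so the retry paths get exercised. Requires the 'testing' build feature.
test_remote_failures = 1
```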
Fixes #3112 --- libs/remote_storage/src/lib.rs | 14 +- libs/remote_storage/src/local_fs.rs | 8 +- libs/remote_storage/src/s3_bucket.rs | 12 +- libs/remote_storage/src/simulate_failures.rs | 129 +++++++++++ pageserver/src/bin/pageserver.rs | 10 +- pageserver/src/config.rs | 17 ++ pageserver/src/storage_sync2.rs | 57 ++++- pageserver/src/storage_sync2/download.rs | 202 +++++++++++++----- test_runner/fixtures/neon_fixtures.py | 22 ++ test_runner/regress/test_remote_storage.py | 24 ++- test_runner/regress/test_tenant_detach.py | 52 +++++ .../test_tenants_with_remote_storage.py | 16 ++ 12 files changed, 480 insertions(+), 83 deletions(-) create mode 100644 libs/remote_storage/src/simulate_failures.rs diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 568cb7224f..1091a8bd5c 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -7,6 +7,7 @@ //! mod local_fs; mod s3_bucket; +mod simulate_failures; use std::{ collections::HashMap, @@ -24,7 +25,7 @@ use tokio::io; use toml_edit::Item; use tracing::info; -pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket}; +pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper}; /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency @@ -77,7 +78,10 @@ pub trait RemoteStorage: Send + Sync + 'static { /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) /// so this method doesnt need to. - async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result>; + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError>; /// Streams the local file contents into remote into the remote storage entry. async fn upload( @@ -150,6 +154,7 @@ impl std::error::Error for DownloadError {} pub enum GenericRemoteStorage { LocalFs(LocalFs), AwsS3(Arc), + Unreliable(Arc), } impl Deref for GenericRemoteStorage { @@ -159,6 +164,7 @@ impl Deref for GenericRemoteStorage { match self { GenericRemoteStorage::LocalFs(local_fs) => local_fs, GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(), + GenericRemoteStorage::Unreliable(s) => s.as_ref(), } } } @@ -178,6 +184,10 @@ impl GenericRemoteStorage { }) } + pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self { + Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) + } + /// Takes storage object contents and its size and uploads to remote storage, /// mapping `from_path` to the corresponding remote object id in the storage. /// diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 50a84eb33f..f1289569ae 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -92,13 +92,17 @@ impl RemoteStorage for LocalFs { .collect()) } - async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { let path = match prefix { Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), None => Cow::Borrowed(&self.storage_root), }; Ok(get_all_files(path.as_ref(), false) - .await? + .await + .map_err(DownloadError::Other)? 
.into_iter() .map(|path| { path.strip_prefix(&self.storage_root) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 740f3753d8..18a2c5dedd 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -286,7 +286,10 @@ impl RemoteStorage for S3Bucket { /// See the doc for `RemoteStorage::list_prefixes` /// Note: it wont include empty "directories" - async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) @@ -308,7 +311,8 @@ impl RemoteStorage for S3Bucket { .concurrency_limiter .acquire() .await - .context("Concurrency limiter semaphore got closed during S3 list")?; + .context("Concurrency limiter semaphore got closed during S3 list") + .map_err(DownloadError::Other)?; metrics::inc_list_objects(); @@ -324,7 +328,9 @@ impl RemoteStorage for S3Bucket { .map_err(|e| { metrics::inc_list_objects_fail(); e - })?; + }) + .context("Failed to list S3 prefixes") + .map_err(DownloadError::Other)?; document_keys.extend( fetch_response diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs new file mode 100644 index 0000000000..643bb99dce --- /dev/null +++ b/libs/remote_storage/src/simulate_failures.rs @@ -0,0 +1,129 @@ +//! This module provides a wrapper around a real RemoteStorage implementation that +//! causes the first N attempts at each upload or download operatio to fail. For +//! testing purposes. +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::sync::Mutex; + +use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata}; + +pub struct UnreliableWrapper { + inner: crate::GenericRemoteStorage, + + // This many attempts of each operation will fail, then we let it succeed. + attempts_to_fail: u64, + + // Tracks how many failed attempts of each operation has been made. + attempts: Mutex>, +} + +/// Used to identify retries of different unique operation. +#[derive(Debug, Hash, Eq, PartialEq)] +enum RemoteOp { + List, + ListPrefixes(Option), + Upload(RemotePath), + Download(RemotePath), + Delete(RemotePath), +} + +impl UnreliableWrapper { + pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self { + assert!(attempts_to_fail > 0); + UnreliableWrapper { + inner, + attempts_to_fail, + attempts: Mutex::new(HashMap::new()), + } + } + + /// + /// Common functionality for all operations. + /// + /// On the first attempts of this operation, return an error. After 'attempts_to_fail' + /// attempts, let the operation go ahead, and clear the counter. 
+ /// + fn attempt(&self, op: RemoteOp) -> Result { + let mut attempts = self.attempts.lock().unwrap(); + + match attempts.entry(op) { + Entry::Occupied(mut e) => { + let attempts_before_this = { + let p = e.get_mut(); + *p += 1; + *p + }; + + if attempts_before_this >= self.attempts_to_fail { + // let it succeed + e.remove(); + Ok(attempts_before_this) + } else { + let error = + anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); + Err(DownloadError::Other(error)) + } + } + Entry::Vacant(e) => { + let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); + e.insert(1); + Err(DownloadError::Other(error)) + } + } + } +} + +#[async_trait::async_trait] +impl RemoteStorage for UnreliableWrapper { + /// Lists all items the storage has right now. + async fn list(&self) -> anyhow::Result> { + self.attempt(RemoteOp::List)?; + self.inner.list().await + } + + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { + self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; + self.inner.list_prefixes(prefix).await + } + + async fn upload( + &self, + data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>, + // S3 PUT request requires the content length to be specified, + // otherwise it starts to fail with the concurrent connection count increasing. + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + ) -> anyhow::Result<()> { + self.attempt(RemoteOp::Upload(to.clone()))?; + self.inner.upload(data, data_size_bytes, to, metadata).await + } + + async fn download(&self, from: &RemotePath) -> Result { + self.attempt(RemoteOp::Download(from.clone()))?; + self.inner.download(from).await + } + + async fn download_byte_range( + &self, + from: &RemotePath, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + // Note: We treat any download_byte_range as an "attempt" of the same + // operation. We don't pay attention to the ranges. That's good enough + // for now. 
+ self.attempt(RemoteOp::Download(from.clone()))?; + self.inner + .download_byte_range(from, start_inclusive, end_exclusive) + .await + } + + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + self.attempt(RemoteOp::Delete(path.clone()))?; + self.inner.delete(path).await + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 47e9382e6d..86ce318d0a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -12,14 +12,13 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, task_mgr, + http, page_cache, page_service, profiling, storage_sync2, task_mgr, task_mgr::TaskKind, task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, }, tenant_mgr, virtual_file, }; -use remote_storage::GenericRemoteStorage; use utils::{ auth::JwtAuth, logging, @@ -281,12 +280,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }; // Set up remote storage client - let remote_storage = conf - .remote_storage_config - .as_ref() - .map(GenericRemoteStorage::from_config) - .transpose() - .context("Failed to init generic remote storage")?; + let remote_storage = storage_sync2::create_remote_storage_client(conf)?; // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(tenant_mgr::init_tenant_mgr(conf, remote_storage.clone()))?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 9971ddc0f7..93c221e622 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -143,6 +143,8 @@ pub struct PageServerConf { /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. 
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, + + pub test_remote_failures: u64, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -221,6 +223,8 @@ struct PageServerConfigBuilder { log_format: BuilderValue, concurrent_tenant_size_logical_size_queries: BuilderValue, + + test_remote_failures: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -256,6 +260,8 @@ impl Default for PageServerConfigBuilder { log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), + + test_remote_failures: Set(0), } } } @@ -336,6 +342,10 @@ impl PageServerConfigBuilder { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } + pub fn test_remote_failures(&mut self, fail_first: u64) { + self.test_remote_failures = BuilderValue::Set(fail_first); + } + pub fn build(self) -> anyhow::Result { Ok(PageServerConf { listen_pg_addr: self @@ -384,6 +394,9 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing concurrent_tenant_size_logical_size_queries" ))?, + test_remote_failures: self + .test_remote_failures + .ok_or(anyhow!("missing test_remote_failuers"))?, }) } } @@ -555,6 +568,7 @@ impl PageServerConf { let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; ConfigurableSemaphore::new(permits) }), + "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -676,6 +690,7 @@ impl PageServerConf { broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + test_remote_failures: 0, } } } @@ -849,6 +864,7 @@ log_format = 'json' )?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + test_remote_failures: 0, }, "Correct defaults should be used when no config values are provided" ); @@ -893,6 +909,7 @@ log_format = 'json' broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + test_remote_failures: 0, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 14763985ab..14ab332eba 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -227,6 +227,18 @@ use crate::{ use utils::id::{TenantId, TimelineId}; +// Occasional network issues and such can cause remote operations to fail, and +// that's expected. If a download fails, we log it at info-level, and retry. +// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN +// level instead, as repeated failures can mean a more serious problem. If it +// fails more than FAILED_DOWNLOAD_RETRIES times, we give up +const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; +const FAILED_DOWNLOAD_RETRIES: u32 = 10; + +// Similarly log failed uploads and deletions at WARN level, after this many +// retries. Uploads and deletions are retried forever, though. +const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; + /// A client for accessing a timeline's data in remote storage. 
/// /// This takes care of managing the number of connections, and balancing them @@ -977,12 +989,14 @@ impl RemoteTimelineClient { Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); - // uploads may fail due to rate limts (IAM, S3) or spurious network and external errors - // such issues are relatively regular, so don't use WARN or ERROR to avoid alerting - // people and tests until the retries are definitely causing delays. - if retries < 3 { + // Uploads can fail due to rate limits (IAM, S3), spurious network problems, + // or other external reasons. Such issues are relatively regular, so log them + // at info level at first, and only WARN if the operation fails repeatedly. + // + // (See similar logic for downloads in `download::download_retry`) + if retries < FAILED_UPLOAD_WARN_THRESHOLD { info!( - "failed to perform remote task {}, will retry (attempt {}): {:?}", + "failed to perform remote task {}, will retry (attempt {}): {:#}", task.op, retries, e ); } else { @@ -1148,6 +1162,39 @@ pub fn create_remote_timeline_client( }) } +/// +/// Create GenericRemoteStorage client from the pageserver config +/// +pub fn create_remote_storage_client( + conf: &'static PageServerConf, +) -> anyhow::Result> { + let config = if let Some(config) = &conf.remote_storage_config { + config + } else { + // No remote storage configured. + return Ok(None); + }; + + // Create the client + let mut remote_storage = GenericRemoteStorage::from_config(config)?; + + // If `test_remote_failures` is non-zero, wrap the client with a + // wrapper that simulates failures. + if conf.test_remote_failures > 0 { + if !cfg!(feature = "testing") { + anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + } + info!( + "Simulating remote failures for first {} attempts of each op", + conf.test_remote_failures + ); + remote_storage = + GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); + } + + Ok(Some(remote_storage)) +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs index 0d25d88a97..c81be05981 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/storage_sync2/download.rs @@ -1,21 +1,28 @@ //! Helper functions to download files from remote storage with a RemoteStorage +//! +//! The functions in this module retry failed operations automatically, according +//! to the FAILED_DOWNLOAD_RETRIES constant. 
+ use std::collections::HashSet; +use std::future::Future; use std::path::Path; -use anyhow::{bail, Context}; +use anyhow::{anyhow, Context}; use futures::stream::{FuturesUnordered, StreamExt}; use tokio::fs; use tokio::io::AsyncWriteExt; -use tracing::{debug, info_span, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; use crate::config::PageServerConf; use crate::storage_sync::index::LayerFileMetadata; use crate::tenant::filename::LayerFileName; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, IndexPartUnclean}; +use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD}; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { fs::File::open(path).await?.sync_all().await @@ -33,12 +40,14 @@ pub async fn download_layer_file<'a>( timeline_id: TimelineId, layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, -) -> anyhow::Result { +) -> Result { let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); let local_path = timeline_path.join(layer_file_name.file_name()); - let remote_path = conf.remote_path(&local_path)?; + let remote_path = conf + .remote_path(&local_path) + .map_err(DownloadError::Other)?; // Perform a rename inspired by durable_rename from file_utils.c. // The sequence: @@ -52,21 +61,30 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); - // TODO: this doesn't use the cached fd for some reason? - let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { - format!( - "Failed to create a destination file for layer '{}'", - temp_file_path.display() - ) - })?; - let mut download = storage.download(&remote_path).await.with_context(|| { - format!( - "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" - ) - })?; - let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") - })?; + let (mut destination_file, bytes_amount) = download_retry( + || async { + // TODO: this doesn't use the cached fd for some reason? 
+ let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { + format!( + "Failed to create a destination file for layer '{}'", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; + let mut download = storage.download(&remote_path).await.with_context(|| { + format!( + "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" + ) + }) + .map_err(DownloadError::Other)?; + let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { + format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok((destination_file, bytes_amount)) + }, + &format!("download {remote_path:?}"), + ).await?; // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: // A file will not be closed immediately when it goes out of scope if there are any IO operations @@ -76,19 +94,23 @@ pub async fn download_layer_file<'a>( // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. // But for additional safety lets check/wait for any pending operations. - destination_file.flush().await.with_context(|| { - format!( - "failed to flush source file at {}", - temp_file_path.display() - ) - })?; + destination_file + .flush() + .await + .with_context(|| { + format!( + "failed to flush source file at {}", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; match layer_metadata.file_size() { Some(expected) if expected != bytes_amount => { - anyhow::bail!( - "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", + return Err(DownloadError::Other(anyhow!( + "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", temp_file_path.display() - ); + ))); } Some(_) | None => { // matches, or upgrading from an earlier IndexPart version @@ -96,23 +118,38 @@ pub async fn download_layer_file<'a>( } // not using sync_data because it can lose file size update - destination_file.sync_all().await.with_context(|| { - format!( - "failed to fsync source file at {}", - temp_file_path.display() - ) - })?; + destination_file + .sync_all() + .await + .with_context(|| { + format!( + "failed to fsync source file at {}", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; drop(destination_file); fail::fail_point!("remote-storage-download-pre-rename", |_| { - bail!("remote-storage-download-pre-rename failpoint triggered") + Err(DownloadError::Other(anyhow!( + "remote-storage-download-pre-rename failpoint triggered" + ))) }); - fs::rename(&temp_file_path, &local_path).await?; + fs::rename(&temp_file_path, &local_path) + .await + .with_context(|| { + format!( + "Could not rename download layer file to {}", + local_path.display(), + ) + }) + .map_err(DownloadError::Other)?; fsync_path(&local_path) .await - .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?; + .with_context(|| format!("Could not fsync layer file {}", local_path.display(),)) + .map_err(DownloadError::Other)?; tracing::info!("download complete: {}", local_path.display()); @@ -143,14 +180,11 @@ pub async fn list_remote_timelines<'a>( let tenant_path = conf.timelines_path(&tenant_id); let 
tenant_storage_path = conf.remote_path(&tenant_path)?; - let timelines = storage - .list_prefixes(Some(&tenant_storage_path)) - .await - .with_context(|| { - format!( - "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download" - ) - })?; + let timelines = download_retry( + || storage.list_prefixes(Some(&tenant_storage_path)), + &format!("list prefixes for {tenant_path:?}"), + ) + .await?; if timelines.is_empty() { anyhow::bail!("no timelines found on the remote storage") @@ -209,16 +243,25 @@ pub async fn download_index_part( .remote_path(&index_part_path) .map_err(DownloadError::BadInput)?; - let mut index_part_download = storage.download(&part_storage_path).await?; + let index_part_bytes = download_retry( + || async { + let mut index_part_download = storage.download(&part_storage_path).await?; - let mut index_part_bytes = Vec::new(); - tokio::io::copy( - &mut index_part_download.download_stream, - &mut index_part_bytes, + let mut index_part_bytes = Vec::new(); + tokio::io::copy( + &mut index_part_download.download_stream, + &mut index_part_bytes, + ) + .await + .with_context(|| { + format!("Failed to download an index part into file {index_part_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok(index_part_bytes) + }, + &format!("download {part_storage_path:?}"), ) - .await - .with_context(|| format!("Failed to download an index part into file {index_part_path:?}")) - .map_err(DownloadError::Other)?; + .await?; let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes) .with_context(|| { @@ -230,3 +273,56 @@ pub async fn download_index_part( Ok(index_part) } + +/// +/// Helper function to handle retries for a download operation. +/// +/// Remote operations can fail due to rate limits (IAM, S3), spurious network +/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times, +/// with backoff. +/// +/// (See similar logic for uploads in `perform_upload_task`) +async fn download_retry(mut op: O, description: &str) -> Result +where + O: FnMut() -> F, + F: Future>, +{ + let mut attempts = 0; + loop { + let result = op().await; + match result { + Ok(_) => { + if attempts > 0 { + info!("{description} succeeded after {attempts} retries"); + } + return result; + } + + // These are "permanent" errors that should not be retried. + Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => { + return result; + } + // Assume that any other failure might be transient, and the operation might + // succeed if we just keep trying. + Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => { + info!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => { + warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(DownloadError::Other(ref err)) => { + // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. 
+ error!("{description} still failed after {attempts} retries, giving up: {err:?}"); + return result; + } + } + // sleep and retry + exponential_backoff( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempts += 1; + } +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2eabc25ef6..287f157d97 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1904,6 +1904,28 @@ class NeonPageserver(PgProtocol): assert not errors + def log_contains(self, pattern: str) -> Optional[str]: + """Check that the pageserver log contains a line that matches the given regex""" + logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. + while True: + line = logfile.readline() + if not line: + break + + if contains_re.search(line): + # found it! + return line + + return None + def append_pageserver_param_overrides( params_to_update: List[str], diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index d8f8298fa6..94e483cdb5 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -56,6 +56,11 @@ def test_remote_storage_backup_and_restore( test_name="test_remote_storage_backup_and_restore", ) + # Exercise retry code path by making all uploads and downloads fail for the + # first time. The retries print INFO-messages to the log; we will check + # that they are present after the test. 
+ neon_env_builder.pageserver_config_override = "test_remote_failures=1" + data_id = 1 data_secret = "very secret secret" @@ -76,6 +81,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.allowed_errors.append( ".*Cannot attach tenant .*?, local tenant directory already exists.*" ) + env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*") pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -87,16 +93,6 @@ def test_remote_storage_backup_and_restore( checkpoint_numbers = range(1, 3) - # On the first iteration, exercise retry code path by making the uploads - # fail for the first 3 times - action = "3*return->off" - pageserver_http.configure_failpoints( - [ - ("before-upload-layer", action), - ("before-upload-index", action), - ] - ) - for checkpoint_number in checkpoint_numbers: with pg.cursor() as cur: cur.execute( @@ -118,6 +114,14 @@ def test_remote_storage_backup_and_restore( wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info(f"upload of checkpoint {checkpoint_number} is done") + # Check that we had to retry the uploads + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadLayer.*, will retry.*" + ) + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadMetadata.*, will retry.*" + ) + ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() env.pageserver.stop() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index ce1e334bfa..8bf0fb7548 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -32,6 +32,58 @@ def do_gc_target( log.info("gc http thread returning") +# Basic detach and re-attach test +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_tenant_reattach( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_reattach", + ) + + # Exercise retry code path by making all uploads and downloads fail for the + # first time. The retries print INFO-messages to the log; we will check + # that they are present after the test. 
+ neon_env_builder.pageserver_config_override = "test_remote_failures=1" + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # create new nenant + tenant_id, timeline_id = env.neon_cli.create_tenant() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # Wait for the all data to be processed by the pageserver and uploaded in remote storage + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # Check that we had to retry the uploads + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadLayer.*, will retry.*" + ) + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadMetadata.*, will retry.*" + ) + + pageserver_http.tenant_detach(tenant_id) + pageserver_http.tenant_attach(tenant_id) + + with pg.cursor() as cur: + assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") + assert env.pageserver.log_contains(".*download.*failed, will retry.*") + + def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 57aaa70559..4cd74e17e9 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -121,6 +121,11 @@ def test_tenants_attached_after_download( data_id = 1 data_secret = "very secret secret" + # Exercise retry code path by making all uploads and downloads fail for the + # first time. The retries print INFO-messages to the log; we will check + # that they are present after the test. 
+ neon_env_builder.pageserver_config_override = "test_remote_failures=1" + ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() @@ -159,6 +164,14 @@ def test_tenants_attached_after_download( wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info(f"upload of checkpoint {checkpoint_number} is done") + # Check that we had to retry the uploads + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadLayer.*, will retry.*" + ) + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadMetadata.*, will retry.*" + ) + ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 env.postgres.stop_all() @@ -211,6 +224,9 @@ def test_tenants_attached_after_download( ) assert detail_before["current_physical_size"] == detail_after["current_physical_size"] + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*") + @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_tenant_upgrades_index_json_from_v0( From 0c71dc627bf0727265ee8fec35cd38c234315842 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 20 Dec 2022 15:54:02 +0200 Subject: [PATCH 117/167] Tidy up walreceiver logs (#3147) Closes https://github.com/neondatabase/neon/issues/3114 Improves walrecevier logs and remove `clone()` calls. --- .../src/walreceiver/connection_manager.rs | 72 +++++++++++-------- .../src/walreceiver/walreceiver_connection.rs | 26 +++++-- 2 files changed, 63 insertions(+), 35 deletions(-) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 8048707480..a65703bca9 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -145,21 +145,17 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Update(c) => { - match c { - TaskStateUpdate::Init | TaskStateUpdate::Started => {}, - TaskStateUpdate::Progress(status) => { - if status.has_processed_wal { - // We have advanced last_record_lsn by processing the WAL received - // from this safekeeper. This is good enough to clean unsuccessful - // retries history and allow reconnecting to this safekeeper without - // sleeping for a long time. - walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); - } - wal_connection.status = status.to_owned(); - } + TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {}, + TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => { + if new_status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. 
+ walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); } - }, + wal_connection.status = new_status; + } TaskEvent::End(walreceiver_task_result) => { match walreceiver_task_result { Ok(()) => debug!("WAL receiving task finished"), @@ -210,7 +206,18 @@ async fn connection_manager_loop_step( } }, - _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} + Some(()) = async { + match time_until_next_retry { + Some(sleep_time) => { + tokio::time::sleep(sleep_time).await; + Some(()) + }, + None => { + debug!("No candidates to retry, waiting indefinitely for the broker events"); + None + } + } + } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), } if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { @@ -480,20 +487,25 @@ impl WalreceiverState { .values() .filter_map(|retry| retry.next_retry_at) .filter(|next_retry_at| next_retry_at > &now) - .min(); + .min()?; - next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) + (next_retry_at - now).to_std().ok() } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { - self.wal_stream_candidates.insert( - NodeId(timeline_update.safekeeper_id), + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); + let old_entry = self.wal_stream_candidates.insert( + new_safekeeper_id, BrokerSkTimeline { timeline: timeline_update, latest_update: Utc::now().naive_utc(), }, ); + + if old_entry.is_none() { + info!("New SK node was added: {new_safekeeper_id}"); + } } /// Cleans up stale broker records and checks the rest for the new connection candidate. @@ -720,12 +732,13 @@ impl WalreceiverState { /// Remove candidates which haven't sent broker updates for a while. 
fn cleanup_old_candidates(&mut self) { let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); + let lagging_wal_timeout = self.lagging_wal_timeout; self.wal_stream_candidates.retain(|node_id, broker_info| { if let Ok(time_since_latest_broker_update) = (Utc::now().naive_utc() - broker_info.latest_update).to_std() { - let should_retain = time_since_latest_broker_update < self.lagging_wal_timeout; + let should_retain = time_since_latest_broker_update < lagging_wal_timeout; if !should_retain { node_ids_to_remove.push(*node_id); } @@ -735,8 +748,11 @@ impl WalreceiverState { } }); - for node_id in node_ids_to_remove { - self.wal_connection_retries.remove(&node_id); + if !node_ids_to_remove.is_empty() { + for node_id in node_ids_to_remove { + info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); + self.wal_connection_retries.remove(&node_id); + } } } @@ -883,10 +899,10 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), @@ -1045,10 +1061,10 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), @@ -1110,10 +1126,10 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index cf2a99f1b5..5b7e60aa5e 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -35,7 +35,7 @@ use pq_proto::ReplicationFeedback; use utils::lsn::Lsn; /// Status of the connection. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy)] pub struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. 
pub is_connected: bool, @@ -83,7 +83,7 @@ pub async fn handle_walreceiver_connection( streaming_lsn: None, commit_lsn: None, }; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); return Ok(()); } @@ -135,7 +135,7 @@ pub async fn handle_walreceiver_connection( connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); return Ok(()); } @@ -184,7 +184,20 @@ pub async fn handle_walreceiver_connection( replication_message = physical_stream.next() => replication_message, } } { - let replication_message = replication_message?; + let replication_message = match replication_message { + Ok(message) => message, + Err(replication_error) => { + if replication_error.is_closed() { + info!("Replication stream got closed"); + return Ok(()); + } else { + return Err( + anyhow::Error::new(replication_error).context("replication stream error") + ); + } + } + }; + let now = Utc::now().naive_utc(); let last_rec_lsn_before_msg = last_rec_lsn; @@ -207,7 +220,7 @@ pub async fn handle_walreceiver_connection( } &_ => {} }; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } @@ -273,8 +286,7 @@ pub async fn handle_walreceiver_connection( if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { // We have successfully processed at least one WAL record. connection_status.has_processed_wal = true; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) - { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } From 9a049aa846aeef3e5d1e3be70b670c6697ba8e35 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Dec 2022 01:42:54 +0200 Subject: [PATCH 118/167] Move code from tenant_mgr::delete_timeline to Tenant::delete_timeline. It's better to request the tasks to shut down only after setting the timeline state to Stopping. Otherwise, it's possible that a new task spawns after we have waited for the existing tasks to shut down, but before we have changed the state. We would fail to wait for them. Feels nicer from a readability point of view too. --- pageserver/src/tenant.rs | 23 ++++++++++++++++++++++- pageserver/src/tenant_mgr.rs | 21 --------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 03387d00fe..af31fda06b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1274,8 +1274,29 @@ impl Tenant { timeline }; - info!("waiting for layer_removal_cs.lock()"); + // Now that the Timeline is in Stopping state, request all the related tasks to + // shut down. 
+ // + // NB: If you call delete_timeline multiple times concurrently, they will + // all go through the motions here. Make sure the code here is idempotent, + // and don't error out if some of the shutdown tasks have already been + // completed! + + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.tenant_id), + Some(timeline_id), + ) + .await; + debug!("wal receiver shutdown confirmed"); + + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; + // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change. + info!("waiting for layer_removal_cs.lock()"); let layer_removal_guard = timeline.layer_removal_cs.lock().await; info!("got layer_removal_cs.lock(), deleting layer files"); diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 85be420cb8..e4e9d0c6e8 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -262,27 +262,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { - // Start with the shutdown of timeline tasks (this shuts down the walreceiver) - // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join - // we cannot create new timelines and tenants, and that can take quite some time, - // it can even become stuck due to a bug making whole pageserver unavailable for some operations - // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation - // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests - // will synchronize and either fail with the not found error or succeed - - debug!("waiting for wal receiver to shutdown"); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(tenant_id), - Some(timeline_id), - ) - .await; - debug!("wal receiver shutdown confirmed"); - - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; - info!("timeline task shutdown completed"); match get_tenant(tenant_id, true).await { Ok(tenant) => { tenant.delete_timeline(timeline_id).await?; From 43fd89eaa7e9100c5f74ce7082dc19e0a9f3135d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Dec 2022 01:52:50 +0200 Subject: [PATCH 119/167] Improve comments, formatting around layer_removal_cs lock. --- pageserver/src/tenant.rs | 49 ++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index af31fda06b..7a03f52155 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1295,26 +1295,41 @@ impl Tenant { info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; - // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change. 
- info!("waiting for layer_removal_cs.lock()"); - let layer_removal_guard = timeline.layer_removal_cs.lock().await; - info!("got layer_removal_cs.lock(), deleting layer files"); + { + // Grab the layer_removal_cs lock, and actually perform the deletion. + // + // This lock prevents multiple concurrent delete_timeline calls from + // stepping on each other's toes, while deleting the files. It also + // prevents GC or compaction from running at the same time. + // + // Note that there are still other race conditions between + // GC, compaction and timeline deletion. GC task doesn't + // register itself properly with the timeline it's + // operating on. See + // https://github.com/neondatabase/neon/issues/2671 + // + // No timeout here, GC & Compaction should be responsive to the + // `TimelineState::Stopping` change. + info!("waiting for layer_removal_cs.lock()"); + let layer_removal_guard = timeline.layer_removal_cs.lock().await; + info!("got layer_removal_cs.lock(), deleting layer files"); - // NB: storage_sync upload tasks that reference these layers have been cancelled - // by the caller. + // NB: storage_sync upload tasks that reference these layers have been cancelled + // by the caller. - let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); - // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up - // with some layers missing. - std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { - format!( - "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() - ) - })?; - info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); + // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up + // with some layers missing. + std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?; - drop(layer_removal_guard); + info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + drop(layer_removal_guard); + } // Remove the timeline from the map. let mut timelines = self.timelines.lock().unwrap(); From 4235f97c6a3276d90a1c3630fa78d4b1495df260 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 29 Nov 2022 20:50:58 +0200 Subject: [PATCH 120/167] Implement consumption metrics collection. Add new background job to collect billing metrics for each tenant and send them to the HTTP endpoint. Metrics are cached, so we don't send non-changed metrics. Add metric collection config parameters: metric_collection_endpoint (default None, i.e. disabled) metric_collection_interval (default 60s) Add test_metric_collection.py to test metric collection and sending to the mocked HTTP endpoint. 
Use port distributor in metric_collection test review fixes: only update cache after metrics were send successfully, simplify code disable metric collection if metric_collection_endpoint is not provided in config --- Cargo.lock | 3 + pageserver/Cargo.toml | 5 +- pageserver/src/billing_metrics.rs | 283 ++++++++++++++++++ pageserver/src/bin/pageserver.rs | 20 ++ pageserver/src/config.rs | 47 +++ pageserver/src/lib.rs | 1 + pageserver/src/storage_sync2.rs | 4 + pageserver/src/task_mgr.rs | 3 + pageserver/src/tenant.rs | 16 + poetry.lock | 44 ++- pyproject.toml | 1 + test_runner/regress/test_metric_collection.py | 138 +++++++++ workspace_hack/Cargo.toml | 4 +- 13 files changed, 557 insertions(+), 12 deletions(-) create mode 100644 pageserver/src/billing_metrics.rs create mode 100644 test_runner/regress/test_metric_collection.py diff --git a/Cargo.lock b/Cargo.lock index 665000746d..2737a4d934 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2415,6 +2415,7 @@ dependencies = [ "rand", "regex", "remote_storage", + "reqwest", "rstar", "scopeguard", "serde", @@ -4753,6 +4754,7 @@ dependencies = [ "ahash", "anyhow", "bytes", + "chrono", "clap 4.0.29", "crossbeam-utils", "either", @@ -4776,6 +4778,7 @@ dependencies = [ "reqwest", "scopeguard", "serde", + "serde_json", "socket2", "stable_deref_trait", "syn", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 24642ca2f7..f5acfcbdc0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -18,7 +18,7 @@ async-stream = "0.3" async-trait = "0.1" byteorder = "1.4.3" bytes = "1.0.1" -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] } clap = { version = "4.0", features = ["string"] } close_fds = "0.3.2" const_format = "0.2.21" @@ -45,7 +45,7 @@ regex = "1.4.5" rstar = "0.9.3" scopeguard = "1.1.0" serde = { version = "1.0", features = ["derive"] } -serde_json = "1" +serde_json = { version = "1.0", features = ["raw_value"] } serde_with = "2.0" signal-hook = "0.3.10" svg_fmt = "0.4.1" @@ -69,6 +69,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" } tenant_size_model = { path = "../libs/tenant_size_model" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } +reqwest = "0.11.13" [dev-dependencies] criterion = "0.4" diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs new file mode 100644 index 0000000000..c5da54b8fc --- /dev/null +++ b/pageserver/src/billing_metrics.rs @@ -0,0 +1,283 @@ +//! +//! Periodically collect consumption metrics for all active tenants +//! and push them to a HTTP endpoint. +//! Cache metrics to send only the updated ones. +//! + +use anyhow; +use tracing::*; +use utils::id::TimelineId; + +use crate::task_mgr; +use crate::tenant_mgr; +use pageserver_api::models::TenantState; +use utils::id::TenantId; + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fmt; +use std::str::FromStr; +use std::time::Duration; + +use chrono::{DateTime, Utc}; +use reqwest::Url; + +/// BillingMetric struct that defines the format for one metric entry +/// i.e. 
+/// +/// ```json +/// { +/// "metric": "remote_storage_size", +/// "type": "absolute", +/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d", +/// "timeline_id": "00000000000000000000000000000000", +/// "time": ..., +/// "value": 12345454, +/// } +/// ``` +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] +pub struct BillingMetric { + pub metric: BillingMetricKind, + pub metric_type: &'static str, + pub tenant_id: TenantId, + pub timeline_id: Option, + pub time: DateTime, + pub value: u64, +} + +impl BillingMetric { + pub fn new_absolute( + metric: BillingMetricKind, + tenant_id: TenantId, + timeline_id: Option, + value: u64, + ) -> Self { + Self { + metric, + metric_type: "absolute", + tenant_id, + timeline_id, + time: Utc::now(), + value, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum BillingMetricKind { + /// Amount of WAL produced , by a timeline, i.e. last_record_lsn + /// This is an absolute, per-timeline metric. + WrittenSize, + /// Size of all tenant branches including WAL + /// This is an absolute, per-tenant metric. + /// This is the same metric that tenant/tenant_id/size endpoint returns. + SyntheticStorageSize, + /// Size of all the files in the tenant's directory on disk on the pageserver. + /// This is an absolute, per-tenant metric. + /// See also prometheus metric CURRENT_PHYSICAL_SIZE. + PhysicalSize, + /// Size of the remote storage (S3) directory. + /// This is an absolute, per-tenant metric. + RemoteStorageSize, +} + +impl FromStr for BillingMetricKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "written_size" => Ok(Self::WrittenSize), + "synthetic_storage_size" => Ok(Self::SyntheticStorageSize), + "physical_size" => Ok(Self::PhysicalSize), + "remote_storage_size" => Ok(Self::RemoteStorageSize), + _ => anyhow::bail!("invalid value \"{s}\" for metric type"), + } + } +} + +impl fmt::Display for BillingMetricKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + BillingMetricKind::WrittenSize => "written_size", + BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size", + BillingMetricKind::PhysicalSize => "physical_size", + BillingMetricKind::RemoteStorageSize => "remote_storage_size", + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct BillingMetricsKey { + tenant_id: TenantId, + timeline_id: Option, + metric: BillingMetricKind, +} + +#[derive(serde::Serialize)] +struct EventChunk<'a> { + events: &'a [BillingMetric], +} + +/// Main thread that serves metrics collection +pub async fn collect_metrics( + metric_collection_endpoint: &Url, + metric_collection_interval: Duration, +) -> anyhow::Result<()> { + let mut ticker = tokio::time::interval(metric_collection_interval); + + info!("starting collect_metrics"); + + // define client here to reuse it for all requests + let client = reqwest::Client::new(); + let mut cached_metrics: HashMap = HashMap::new(); + + loop { + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("collect_metrics received cancellation request"); + return Ok(()); + }, + _ = ticker.tick() => { + collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint).await?; + } + } + } +} + +/// One iteration of metrics collection +/// +/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`. 
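+/// Only tenants in the `Active` state are considered; events are sent in chunks of up to 1000.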
+/// Cache metrics to avoid sending the same metrics multiple times. +pub async fn collect_metrics_task( + client: &reqwest::Client, + cached_metrics: &mut HashMap, + metric_collection_endpoint: &reqwest::Url, +) -> anyhow::Result<()> { + let mut current_metrics: Vec<(BillingMetricsKey, u64)> = Vec::new(); + trace!( + "starting collect_metrics_task. metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + // get list of tenants + let tenants = tenant_mgr::list_tenants().await; + + // iterate through list of Active tenants and collect metrics + for (tenant_id, tenant_state) in tenants { + if tenant_state != TenantState::Active { + continue; + } + + let tenant = tenant_mgr::get_tenant(tenant_id, true).await?; + + let mut tenant_physical_size = 0; + + // iterate through list of timelines in tenant + for timeline in tenant.list_timelines().iter() { + let timeline_written_size = u64::from(timeline.get_last_record_lsn()); + + current_metrics.push(( + BillingMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: BillingMetricKind::WrittenSize, + }, + timeline_written_size, + )); + + let timeline_size = timeline.get_physical_size(); + tenant_physical_size += timeline_size; + + debug!( + "per-timeline current metrics for tenant: {}: timeline {} physical_size={} last_record_lsn {} (as bytes)", + tenant_id, timeline.timeline_id, timeline_size, timeline_written_size) + } + + let tenant_remote_size = tenant.get_remote_size().await?; + debug!( + "collected current metrics for tenant: {}: state={:?} tenant_physical_size={} remote_size={}", + tenant_id, tenant_state, tenant_physical_size, tenant_remote_size + ); + + current_metrics.push(( + BillingMetricsKey { + tenant_id, + timeline_id: None, + metric: BillingMetricKind::PhysicalSize, + }, + tenant_physical_size, + )); + + current_metrics.push(( + BillingMetricsKey { + tenant_id, + timeline_id: None, + metric: BillingMetricKind::RemoteStorageSize, + }, + tenant_remote_size, + )); + + // TODO add SyntheticStorageSize metric + } + + // Filter metrics + current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) { + Some(val) => val != curr_val, + None => true, + }); + + if current_metrics.is_empty() { + trace!("no new metrics to send"); + return Ok(()); + } + + // Send metrics. 
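+    // If a send fails, the cache is not updated, so those metrics are picked up again on the next tick.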
+ // Split into chunks of 1000 metrics to avoid exceeding the max request size + const CHUNK_SIZE: usize = 1000; + let chunks = current_metrics.chunks(CHUNK_SIZE); + + let mut chunk_to_send: Vec = Vec::with_capacity(1000); + + for chunk in chunks { + chunk_to_send.clear(); + // enrich metrics with timestamp and metric_kind before sending + chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| { + BillingMetric::new_absolute( + curr_key.metric, + curr_key.tenant_id, + curr_key.timeline_id, + *curr_val, + ) + })); + + let chunk_json = serde_json::value::to_raw_value(&EventChunk { + events: &chunk_to_send, + }) + .expect("BillingMetric should not fail serialization"); + + let res = client + .post(metric_collection_endpoint.clone()) + .json(&chunk_json) + .send() + .await; + + match res { + Ok(res) => { + if res.status().is_success() { + // update cached metrics after they were sent successfully + for (curr_key, curr_val) in chunk.iter() { + cached_metrics.insert(curr_key.clone(), *curr_val); + } + } else { + error!("metrics endpoint refused the sent metrics: {:?}", res); + } + } + Err(err) => { + error!("failed to send metrics: {:?}", err); + } + } + } + + Ok(()) +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 86ce318d0a..cc403ec2ea 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -310,6 +310,26 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { Ok(()) }, ); + + if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::MetricsCollection, + None, + None, + "consumption metrics collection", + true, + async move { + pageserver::billing_metrics::collect_metrics( + metric_collection_endpoint, + conf.metric_collection_interval, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + }, + ); + } } // Spawn a task to listen for libpq connections. It will spawn further tasks diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 93c221e622..c6f417390f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -12,6 +12,7 @@ use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; use once_cell::sync::OnceCell; +use reqwest::Url; use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -55,6 +56,8 @@ pub mod defaults { pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); + pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "60 s"; + pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; /// /// Default built-in configuration file. /// @@ -78,6 +81,8 @@ pub mod defaults { #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' +#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -144,6 +149,10 @@ pub struct PageServerConf { /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, + // How often to collect metrics and send them to the metrics endpoint. 
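+    // Collection is disabled when `metric_collection_endpoint` is `None` (the default).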
+ pub metric_collection_interval: Duration, + pub metric_collection_endpoint: Option, + pub test_remote_failures: u64, } @@ -224,6 +233,9 @@ struct PageServerConfigBuilder { concurrent_tenant_size_logical_size_queries: BuilderValue, + metric_collection_interval: BuilderValue, + metric_collection_endpoint: BuilderValue>, + test_remote_failures: BuilderValue, } @@ -260,6 +272,11 @@ impl Default for PageServerConfigBuilder { log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), + metric_collection_interval: Set(humantime::parse_duration( + DEFAULT_METRIC_COLLECTION_INTERVAL, + ) + .expect("cannot parse default metric collection interval")), + metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), test_remote_failures: Set(0), } @@ -342,6 +359,14 @@ impl PageServerConfigBuilder { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } + pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) { + self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) + } + + pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { + self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) + } + pub fn test_remote_failures(&mut self, fail_first: u64) { self.test_remote_failures = BuilderValue::Set(fail_first); } @@ -394,6 +419,12 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing concurrent_tenant_size_logical_size_queries" ))?, + metric_collection_interval: self + .metric_collection_interval + .ok_or(anyhow!("missing metric_collection_interval"))?, + metric_collection_endpoint: self + .metric_collection_endpoint + .ok_or(anyhow!("missing metric_collection_endpoint"))?, test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, @@ -568,6 +599,12 @@ impl PageServerConf { let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; ConfigurableSemaphore::new(permits) }), + "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), + "metric_collection_endpoint" => { + let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; + builder.metric_collection_endpoint(Some(endpoint)); + }, + "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } @@ -690,6 +727,8 @@ impl PageServerConf { broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: Duration::from_secs(60), + metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, test_remote_failures: 0, } } @@ -821,6 +860,8 @@ max_file_descriptors = 333 initial_superuser_name = 'zzzz' id = 10 +metric_collection_interval = '222 s' +metric_collection_endpoint = 'http://localhost:80/metrics' log_format = 'json' "#; @@ -864,6 +905,10 @@ log_format = 'json' )?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: humantime::parse_duration( + defaults::DEFAULT_METRIC_COLLECTION_INTERVAL + )?, + metric_collection_endpoint: 
defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, test_remote_failures: 0, }, "Correct defaults should be used when no config values are provided" @@ -909,6 +954,8 @@ log_format = 'json' broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: Duration::from_secs(222), + metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), test_remote_failures: 0, }, "Should be able to parse all basic config values correctly" diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 5c4804db36..626d5e99e3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,5 +1,6 @@ mod auth; pub mod basebackup; +pub mod billing_metrics; pub mod config; pub mod http; pub mod import_datadir; diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 14ab332eba..9253b250cd 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -517,6 +517,10 @@ impl RemoteTimelineClient { self.metrics.remote_physical_size_gauge().set(size); } + pub fn get_remote_physical_size(&self) -> u64 { + self.metrics.remote_physical_size_gauge().get() + } + // // Download operations. // diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 91719fb3af..fe3ad1a57d 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -203,6 +203,9 @@ pub enum TaskKind { // task that handles attaching a tenant Attach, + + // task that handhes metrics collection + MetricsCollection, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7a03f52155..0ff5089f66 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -700,6 +700,22 @@ impl Tenant { Ok(()) } + /// get size of all remote timelines + /// + /// This function relies on the index_part instead of listing the remote storage + /// + pub async fn get_remote_size(&self) -> anyhow::Result { + let mut size = 0; + + for timeline in self.list_timelines().iter() { + if let Some(remote_client) = &timeline.remote_client { + size += remote_client.get_remote_physical_size(); + } + } + + Ok(size) + } + #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))] async fn load_remote_timeline( &self, diff --git a/poetry.lock b/poetry.lock index 2fa7f03679..f5cbe24954 100644 --- a/poetry.lock +++ b/poetry.lock @@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0" psycopg2-binary = ">=2.8.4" [package.extras] -sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] +sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" @@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = 
"aws-sam-translator" @@ -569,7 +569,7 @@ optional = false python-versions = ">=3.6.0" [package.extras] -unicode-backport = ["unicodedata2"] +unicode_backport = ["unicodedata2"] [[package]] name = "click" @@ -747,9 +747,9 @@ python-versions = ">=3.6.1,<4.0" [package.extras] colors = ["colorama (>=0.4.3,<0.5.0)"] -pipfile-deprecated-finder = ["pipreqs", "requirementslib"] +pipfile_deprecated_finder = ["pipreqs", "requirementslib"] plugins = ["setuptools"] -requirements-deprecated-finder = ["pip-api", "pipreqs"] +requirements_deprecated_finder = ["pip-api", "pipreqs"] [[package]] name = "itsdangerous" @@ -824,7 +824,7 @@ python-versions = ">=2.7" [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -testing-libs = ["simplejson", "ujson", "yajl"] +"testing.libs" = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" @@ -850,7 +850,7 @@ six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -1227,6 +1227,17 @@ pytest = ">=6.1.0" [package.extras] testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] +[[package]] +name = "pytest-httpserver" +version = "1.0.6" +description = "pytest-httpserver is a httpserver for pytest" +category = "main" +optional = false +python-versions = ">=3.7,<4.0" + +[package.dependencies] +Werkzeug = ">=2.0.0" + [[package]] name = "pytest-lazy-fixture" version = "0.6.3" @@ -1350,7 +1361,7 @@ urllib3 = ">=1.21.1,<1.27" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" @@ -1583,7 +1594,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "98d63eaa73253882440e0fc8cdb305bb536944768c5ba313c25d0ee65f546544" +content-hash = "55aba66810d5b47d25372c740e4d466e1e791c4d0e665c57a611ab8665563689" [metadata.files] aiopg = [ @@ -2099,7 +2110,18 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyasn1 = [ + {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, + {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, + {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, + {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, + {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, + {file = "pyasn1-0.4.8-py3.2.egg", hash = 
"sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, + {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, + {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, + {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, + {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, + {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ @@ -2157,6 +2179,10 @@ pytest-asyncio = [ {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"}, {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"}, ] +pytest-httpserver = [ + {file = "pytest_httpserver-1.0.6-py3-none-any.whl", hash = "sha256:ac2379acc91fe8bdbe2911c93af8dd130e33b5899fb9934d15669480739c6d32"}, + {file = "pytest_httpserver-1.0.6.tar.gz", hash = "sha256:9040d07bf59ac45d8de3db1d4468fd2d1d607975e4da4c872ecc0402cdbf7b3e"}, +] pytest-lazy-fixture = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, diff --git a/pyproject.toml b/pyproject.toml index b297f7f70b..4819ece4b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ toml = "^0.10.2" psutil = "^5.9.4" types-psutil = "^5.9.5.4" types-toml = "^0.10.8" +pytest-httpserver = "^1.0.6" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py new file mode 100644 index 0000000000..7f86d92962 --- /dev/null +++ b/test_runner/regress/test_metric_collection.py @@ -0,0 +1,138 @@ +import pytest +from fixtures.log_helper import log +from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PortDistributor, + RemoteStorageKind, + wait_for_last_flush_lsn, +) +from fixtures.types import TenantId, TimelineId +from fixtures.utils import query_scalar +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +@pytest.fixture(scope="session") +def httpserver_listen_address(port_distributor: PortDistributor): + port = port_distributor.get_port() + return ("localhost", port) + + +num_metrics_received = 0 +remote_uploaded = 0 + + +# +# verify that metrics look minilally sane +# +def metrics_handler(request: Request) -> Response: + if request.json is None: + return Response(status=400) + + events = request.json["events"] + log.info("received events:") + log.info(events) + + checks = { + "written_size": lambda value: value > 0, + "physical_size": lambda value: value >= 0, + # >= 0 check here is to avoid race condition when we receive metrics before + # remote_uploaded is updated + "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, + } + + for event in events: + assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} 
isn't valid" + + assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received" + + global num_metrics_received + num_metrics_received += 1 + return Response(status=200) + + +@pytest.mark.parametrize( + "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS] +) +def test_metric_collection( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, + remote_storage_kind: RemoteStorageKind, +): + (host, port) = httpserver_listen_address + metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + + # Disable time-based pitr, we will use the manual GC calls + # to trigger remote storage operations in a controlled way + neon_env_builder.pageserver_config_override = ( + f""" + metric_collection_endpoint="{metric_collection_endpoint}" + """ + + "tenant_config={pitr_interval = '0 sec'}" + ) + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_metric_collection", + ) + + log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") + + # mock http server that returns OK for the metrics + httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( + metrics_handler + ) + + # spin up neon, after http server is ready + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_metric_collection") + pg = env.postgres.create_start("test_metric_collection") + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id")) + timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Helper function that gets the number of given kind of remote ops from the metrics + def get_num_remote_ops(file_kind: str, op_kind: str) -> int: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + total = 0.0 + for sample in ps_metrics.query_all( + name="pageserver_remote_operation_seconds_count", + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ): + total += sample[2] + return int(total) + + # upload some data to remote storage + if remote_storage_kind == RemoteStorageKind.LOCAL_FS: + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) + global remote_uploaded + remote_uploaded = get_num_remote_ops("index", "upload") + assert remote_uploaded > 0 + + # check that all requests are served + httpserver.check() + global num_metrics_received + assert num_metrics_received > 0, "no metrics were received" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index de9a26513d..6c81756fe1 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,6 +16,7 @@ publish = false ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } +chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] } clap = { version = "4", features = ["color", "derive", "error-context", "help", "std", 
"string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } @@ -36,9 +37,10 @@ prost = { version = "0.11", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] } +reqwest = { version = "0.11", features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +serde_json = { version = "1", features = ["raw_value", "std"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } stable_deref_trait = { version = "1", features = ["alloc", "std"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } From 5d4774491f2e22c763dc045a35f799302d146007 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 21 Dec 2022 00:52:07 +0200 Subject: [PATCH 121/167] Exclude macOs fork files from tar processing (#3165) When running tenant relocation tests, we use https://github.com/neondatabase/neon/blob/main/scripts/export_import_between_pageservers.py script to export and import basebackup between pageservers. When pageserver runs on macOs and reuses the `tar` library for creating the basebackup archive, it gets the fork files https://superuser.com/questions/61185/why-do-i-get-files-like-foo-in-my-tarball-on-os-x We might be able to fix our code to fix the issue, but if we get such (valid) archive as an input, we [fail](https://github.com/neondatabase/neon/pull/3013#issuecomment-1360093900). This does not seem optimal, given that we can ignore such files. --- pageserver/src/import_datadir.rs | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 642e41765b..db83bdb3a1 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -440,16 +440,22 @@ fn import_file( reader: Reader, len: usize, ) -> Result> { + let file_name = match file_path.file_name() { + Some(name) => name.to_string_lossy(), + None => return Ok(None), + }; + + if file_name.starts_with('.') { + // tar archives on macOs, created without COPYFILE_DISABLE=1 env var + // will contain "fork files", skip them. 
+ return Ok(None); + } + if file_path.starts_with("global") { let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; - match file_path - .file_name() - .expect("missing filename") - .to_string_lossy() - .as_ref() - { + match file_name.as_ref() { "pg_control" => { let bytes = read_all_bytes(reader)?; @@ -485,12 +491,7 @@ fn import_file( .to_string_lossy() .parse()?; - match file_path - .file_name() - .expect("missing base filename") - .to_string_lossy() - .as_ref() - { + match file_name.as_ref() { "pg_filenode.map" => { let bytes = read_all_bytes(reader)?; modification.put_relmap_file(spcnode, dbnode, bytes)?; @@ -520,11 +521,7 @@ fn import_file( import_slru(modification, slru, file_path, reader, len)?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { - let file_name = &file_path - .file_name() - .expect("missing twophase filename") - .to_string_lossy(); - let xid = u32::from_str_radix(file_name, 16)?; + let xid = u32::from_str_radix(file_name.as_ref(), 16)?; let bytes = read_all_bytes(reader)?; modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?; From 486a985629c9f8c908153522858469c667ce088d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 21 Dec 2022 09:38:42 +0000 Subject: [PATCH 122/167] mypy: enable check_untyped_defs (#3142) Enable `check_untyped_defs` and fix warnings. --- poetry.lock | 87 ++++++++++---------- pyproject.toml | 6 +- scripts/export_import_between_pageservers.py | 18 ++-- test_runner/fixtures/compare_fixtures.py | 2 +- test_runner/fixtures/neon_fixtures.py | 18 ++-- test_runner/performance/test_copy.py | 3 +- test_runner/regress/test_compute_ctl.py | 6 +- test_runner/regress/test_import.py | 4 +- test_runner/regress/test_proxy.py | 1 + test_runner/regress/test_wal_acceptor.py | 3 + 10 files changed, 78 insertions(+), 70 deletions(-) diff --git a/poetry.lock b/poetry.lock index f5cbe24954..1b04230cef 100644 --- a/poetry.lock +++ b/poetry.lock @@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0" psycopg2-binary = ">=2.8.4" [package.extras] -sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] +sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" @@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -569,7 +569,7 @@ optional = false python-versions = ">=3.6.0" [package.extras] -unicode_backport = ["unicodedata2"] +unicode-backport = ["unicodedata2"] [[package]] name = "click" @@ -747,9 +747,9 @@ python-versions = ">=3.6.1,<4.0" [package.extras] colors = ["colorama (>=0.4.3,<0.5.0)"] -pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +pipfile-deprecated-finder = ["pipreqs", "requirementslib"] plugins = ["setuptools"] 
-requirements_deprecated_finder = ["pip-api", "pipreqs"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] [[package]] name = "itsdangerous" @@ -824,7 +824,7 @@ python-versions = ">=2.7" [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -"testing.libs" = ["simplejson", "ujson", "yajl"] +testing-libs = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" @@ -850,7 +850,7 @@ six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -941,11 +941,11 @@ xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] name = "mypy" -version = "0.971" +version = "0.991" description = "Optional static typing for Python" category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] mypy-extensions = ">=0.4.3" @@ -954,6 +954,7 @@ typing-extensions = ">=3.10" [package.extras] dmypy = ["psutil (>=4.0)"] +install-types = ["pip"] python2 = ["typed-ast (>=1.4.0,<2)"] reports = ["lxml"] @@ -1361,7 +1362,7 @@ urllib3 = ">=1.21.1,<1.27" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" @@ -1594,7 +1595,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "55aba66810d5b47d25372c740e4d466e1e791c4d0e665c57a611ab8665563689" +content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70" [metadata.files] aiopg = [ @@ -1960,29 +1961,36 @@ moto = [ {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] mypy = [ - {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"}, - {file = "mypy-0.971-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:98e02d56ebe93981c41211c05adb630d1d26c14195d04d95e49cd97dbc046dc5"}, - {file = "mypy-0.971-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:19830b7dba7d5356d3e26e2427a2ec91c994cd92d983142cbd025ebe81d69cf3"}, - {file = "mypy-0.971-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:02ef476f6dcb86e6f502ae39a16b93285fef97e7f1ff22932b657d1ef1f28655"}, - {file = "mypy-0.971-cp310-cp310-win_amd64.whl", hash = "sha256:25c5750ba5609a0c7550b73a33deb314ecfb559c350bb050b655505e8aed4103"}, - {file = "mypy-0.971-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d3348e7eb2eea2472db611486846742d5d52d1290576de99d59edeb7cd4a42ca"}, - {file = "mypy-0.971-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3fa7a477b9900be9b7dd4bab30a12759e5abe9586574ceb944bc29cddf8f0417"}, - {file = "mypy-0.971-cp36-cp36m-win_amd64.whl", hash = "sha256:2ad53cf9c3adc43cf3bea0a7d01a2f2e86db9fe7596dfecb4496a5dda63cbb09"}, - 
{file = "mypy-0.971-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:855048b6feb6dfe09d3353466004490b1872887150c5bb5caad7838b57328cc8"}, - {file = "mypy-0.971-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:23488a14a83bca6e54402c2e6435467a4138785df93ec85aeff64c6170077fb0"}, - {file = "mypy-0.971-cp37-cp37m-win_amd64.whl", hash = "sha256:4b21e5b1a70dfb972490035128f305c39bc4bc253f34e96a4adf9127cf943eb2"}, - {file = "mypy-0.971-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9796a2ba7b4b538649caa5cecd398d873f4022ed2333ffde58eaf604c4d2cb27"}, - {file = "mypy-0.971-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a361d92635ad4ada1b1b2d3630fc2f53f2127d51cf2def9db83cba32e47c856"}, - {file = "mypy-0.971-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b793b899f7cf563b1e7044a5c97361196b938e92f0a4343a5d27966a53d2ec71"}, - {file = "mypy-0.971-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d1ea5d12c8e2d266b5fb8c7a5d2e9c0219fedfeb493b7ed60cd350322384ac27"}, - {file = "mypy-0.971-cp38-cp38-win_amd64.whl", hash = "sha256:23c7ff43fff4b0df93a186581885c8512bc50fc4d4910e0f838e35d6bb6b5e58"}, - {file = "mypy-0.971-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1f7656b69974a6933e987ee8ffb951d836272d6c0f81d727f1d0e2696074d9e6"}, - {file = "mypy-0.971-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d2022bfadb7a5c2ef410d6a7c9763188afdb7f3533f22a0a32be10d571ee4bbe"}, - {file = "mypy-0.971-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef943c72a786b0f8d90fd76e9b39ce81fb7171172daf84bf43eaf937e9f220a9"}, - {file = "mypy-0.971-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d744f72eb39f69312bc6c2abf8ff6656973120e2eb3f3ec4f758ed47e414a4bf"}, - {file = "mypy-0.971-cp39-cp39-win_amd64.whl", hash = "sha256:77a514ea15d3007d33a9e2157b0ba9c267496acf12a7f2b9b9f8446337aac5b0"}, - {file = "mypy-0.971-py3-none-any.whl", hash = "sha256:0d054ef16b071149917085f51f89555a576e2618d5d9dd70bd6eea6410af3ac9"}, - {file = "mypy-0.971.tar.gz", hash = "sha256:40b0f21484238269ae6a57200c807d80debc6459d444c0489a102d7c6a75fa56"}, + {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, + {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, + {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"}, + {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"}, + {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"}, + {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash = "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"}, + {file = "mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"}, + {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"}, + {file = "mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"}, + {file = 
"mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"}, + {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"}, + {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"}, + {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"}, + {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"}, + {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"}, + {file = "mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"}, + {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"}, + {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"}, + {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"}, + {file = "mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"}, + {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"}, + {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"}, + {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"}, + {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"}, + {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"}, + {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"}, + {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"}, + {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"}, + {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"}, + {file = "mypy-0.991.tar.gz", hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"}, ] mypy-boto3-s3 = [ {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"}, @@ -2110,18 +2118,7 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyasn1 = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = 
"sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ diff --git a/pyproject.toml b/pyproject.toml index 4819ece4b0..b4fb7a9e7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ pytest-httpserver = "^1.0.6" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" -mypy = "==0.971" +mypy = "==0.991" black = "^22.6.0" isort = "^5.10.1" @@ -61,10 +61,8 @@ skip = [ ] [tool.mypy] -# mypy uses regex exclude = "^vendor/" -# some tests don't typecheck when this flag is set -check_untyped_defs = false +check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. 
mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner" diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 1734038661..8ea3f13bf5 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -448,15 +448,15 @@ def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - conn = psycopg2.connect(pageserver_connstr) - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - prev_lsn = res[0] - last_lsn = res[1] - conn.close() + with closing(psycopg2.connect(pageserver_connstr)) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" + cur.execute(cmd) + res = cur.fetchone() + assert res is not None + prev_lsn = res[0] + last_lsn = res[1] return last_lsn, prev_lsn diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index fa488c4446..be1f146735 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -177,7 +177,7 @@ class VanillaCompare(PgCompare): self.cur = self.conn.cursor() @property - def pg(self) -> PgProtocol: + def pg(self) -> VanillaPostgres: return self._pg @property diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 287f157d97..d52ca38447 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -623,6 +623,7 @@ class NeonEnvBuilder: return self.env def start(self): + assert self.env is not None, "environment is not already initialized, call init() first" self.env.start() def init_start(self) -> NeonEnv: @@ -751,6 +752,11 @@ class NeonEnvBuilder: log.info("no remote storage was set up, skipping cleanup") return + # Making mypy happy with allowing only `S3Storage` further. 
+ # `self.remote_storage_prefix` is coupled with `S3Storage` storage type, + # so this line effectively a no-op + assert isinstance(self.remote_storage, S3Storage) + if self.keep_remote_storage_contents: log.info("keep_remote_storage_contents skipping remote storage cleanup") return @@ -766,7 +772,8 @@ class NeonEnvBuilder: Prefix=self.remote_storage_prefix, ) - objects_to_delete = {"Objects": []} + # Using Any because DeleteTypeDef (from boto3-stubs) doesn't fit our case + objects_to_delete: Any = {"Objects": []} cnt = 0 for item in pages.search("Contents"): # weirdly when nothing is found it returns [None] @@ -781,16 +788,17 @@ class NeonEnvBuilder: Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete, ) - objects_to_delete = dict(Objects=[]) + objects_to_delete = {"Objects": []} cnt += 1 # flush rest if len(objects_to_delete["Objects"]): self.remote_storage_client.delete_objects( - Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete + Bucket=self.remote_storage.bucket_name, + Delete=objects_to_delete, ) - log.info("deleted %s objects from remote storage", cnt) + log.info(f"deleted {cnt} objects from remote storage") def __enter__(self) -> "NeonEnvBuilder": return self @@ -2772,7 +2780,7 @@ class NeonBroker: log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') with open(self.logfile, "wb") as logfile: args = [ - self.neon_binpath / "storage_broker", + str(self.neon_binpath / "storage_broker"), f"--listen-addr={listen_addr}", ] self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index 01b2097112..a91c78e867 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,5 +1,6 @@ from contextlib import closing from io import BufferedReader, RawIOBase +from typing import Optional from fixtures.compare_fixtures import PgCompare @@ -8,7 +9,7 @@ class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows self.rownum = 0 - self.linebuf = None + self.linebuf: Optional[bytes] = None self.ptr = 0 def readable(self): diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index 74ee2a89d4..f973bd8e60 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -193,8 +193,8 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): timeout=10, ) except TimeoutExpired as exc: - ctl_logs = exc.stderr.decode("utf-8") - log.info("compute_ctl output:\n" + ctl_logs) + ctl_logs = (exc.stderr or b"").decode("utf-8") + log.info("compute_ctl output:\n{ctl_logs}") with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): start = "starting safekeepers syncing" @@ -240,7 +240,7 @@ class ExternalProcessManager: with self.pid_file: try: os.kill(self.pid, signal.SIGTERM) - except os.OsError as e: + except OSError as e: if not self.path.is_file(): return log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index fb1bc4839e..0388e24e98 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -53,10 +53,10 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build unpacked_base = os.path.join(basebackup_dir, "unpacked-base") corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar") os.mkdir(unpacked_base, 0o750) - 
subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base]) + subprocess_capture(test_output_dir, ["tar", "-xf", base_tar, "-C", unpacked_base]) os.remove(os.path.join(unpacked_base, "global/pg_control")) subprocess_capture( - str(test_output_dir), + test_output_dir, ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), cwd=unpacked_base, ) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 4d2b63d360..bcea4d970c 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -71,6 +71,7 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx log.info("sending session activation message") psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info) + assert psql.stdout is not None out = (await psql.stdout.read()).decode("utf-8").strip() assert out == "ok" diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3b72aba422..d88ed319b5 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -883,9 +883,12 @@ class SafekeeperEnv: raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}") def get_safekeeper_connstrs(self): + assert self.safekeepers is not None, "safekeepers are not initialized" return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers]) def create_postgres(self): + assert self.tenant_id is not None, "tenant_id is not initialized" + assert self.timeline_id is not None, "tenant_id is not initialized" pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata") pg = ProposerPostgres( pgdata_dir, From a3f0111726861aa7a758ead2861a66f052bd38b2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 19 Dec 2022 19:43:06 +0100 Subject: [PATCH 123/167] LayerMap::search is actually infallible Found this while investigating failure modes of on-demand download. I think it's a nice cleanup. --- pageserver/benches/bench_layer_map.rs | 6 +++--- pageserver/src/tenant/layer_map.rs | 16 ++++++++-------- pageserver/src/tenant/timeline.rs | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 6001377811..a0c38e1e3a 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -163,7 +163,7 @@ fn bench_from_captest_env(c: &mut Criterion) { c.bench_function("captest_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); @@ -192,7 +192,7 @@ fn bench_from_real_project(c: &mut Criterion) { c.bench_function("real_map_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); @@ -238,7 +238,7 @@ fn bench_sequential(c: &mut Criterion) { // Run the search queries b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 19252ecf6e..0202ccfa6a 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -261,7 +261,7 @@ where /// contain the version, even if it's missing from the returned /// layer. 
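+    /// Returns `None` if no layer in the map covers `key` at `end_lsn`.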
/// - pub fn search(&self, key: Key, end_lsn: Lsn) -> Result>> { + pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { // linear search // Find the latest image layer that covers the given key let mut latest_img: Option> = None; @@ -286,10 +286,10 @@ where assert!(img_lsn < end_lsn); if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match - return Ok(Some(SearchResult { + return Some(SearchResult { layer: Arc::clone(l), lsn_floor: img_lsn, - })); + }); } if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { latest_img = Some(Arc::clone(l)); @@ -346,19 +346,19 @@ where Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), l.get_lsn_range().start, ); - Ok(Some(SearchResult { + Some(SearchResult { lsn_floor, layer: l, - })) + }) } else if let Some(l) = latest_img { trace!("found img layer and no deltas for request on {key} at {end_lsn}"); - Ok(Some(SearchResult { + Some(SearchResult { lsn_floor: latest_img_lsn.unwrap(), layer: l, - })) + }) } else { trace!("no layer found for request on {key} at {end_lsn}"); - Ok(None) + None } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0697ec4bd6..4a54c91d25 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1587,7 +1587,7 @@ impl Timeline { } } - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); let lsn_floor = max(cached_lsn + 1, lsn_floor); From f637f6e77e035215517603bd8f6f8e74bcb9f675 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 21 Dec 2022 12:20:53 +0100 Subject: [PATCH 124/167] stop exposing non-incremental sizes in API spec Console doesn't use them, so, don't expose them. 
refs https://github.com/neondatabase/cloud/pull/3358 refs https://github.com/neondatabase/cloud/pull/3366 --- pageserver/src/http/openapi_spec.yml | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 67cf4ea326..f9b8a81dad 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -77,16 +77,6 @@ paths: schema: type: string format: hex - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - - name: include-non-incremental-physical-size - in: query - schema: - type: string - description: Controls calculation of current_physical_size_non_incremental get: description: Get timelines for tenant responses: @@ -139,17 +129,6 @@ paths: format: hex get: description: Get info about the timeline - parameters: - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - - name: include-non-incremental-physical-size - in: query - schema: - type: string - description: Controls calculation of current_physical_size_non_incremental responses: "200": description: TimelineInfo @@ -779,10 +758,6 @@ components: type: integer current_physical_size: type: integer - current_logical_size_non_incremental: - type: integer - current_physical_size_non_incremental: - type: integer wal_source_connstr: type: string last_received_msg_lsn: From 91e89371121698e1e0522cc8374d393b94480e65 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 5 Dec 2022 06:38:13 -0500 Subject: [PATCH 125/167] no-op: add Timeline::myself member --- pageserver/src/tenant.rs | 10 +-- pageserver/src/tenant/timeline.rs | 107 ++++++++++++++++-------------- 2 files changed, 61 insertions(+), 56 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0ff5089f66..ce05d8f085 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -480,7 +480,7 @@ impl Tenant { let timeline = UninitializedTimeline { owning_tenant: self, timeline_id, - raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())), + raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())), }; // Do not start walreceiver here. 
We do need loaded layer map for reconcile_with_remote // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver @@ -510,7 +510,7 @@ impl Tenant { ) })?; broken_timeline.set_state(TimelineState::Broken); - timelines_accessor.insert(timeline_id, Arc::new(broken_timeline)); + timelines_accessor.insert(timeline_id, broken_timeline); Err(e) } } @@ -1647,7 +1647,7 @@ impl Tenant { new_metadata: TimelineMetadata, ancestor: Option>, remote_client: Option, - ) -> anyhow::Result { + ) -> anyhow::Result> { if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { anyhow::ensure!( ancestor.is_some(), @@ -2209,7 +2209,7 @@ impl Tenant { Ok(UninitializedTimeline { owning_tenant: self, timeline_id: new_timeline_id, - raw_timeline: Some((Arc::new(new_timeline), uninit_mark)), + raw_timeline: Some((new_timeline, uninit_mark)), }) } Err(e) => { @@ -2227,7 +2227,7 @@ impl Tenant { new_metadata: TimelineMetadata, ancestor: Option>, remote_client: Option, - ) -> anyhow::Result { + ) -> anyhow::Result> { let timeline_data = self .create_timeline_data( new_timeline_id, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4a54c91d25..e891caa6f8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,7 +17,7 @@ use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use crate::storage_sync::index::IndexPart; @@ -76,6 +76,8 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, + _myself: Weak, + pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -748,75 +750,78 @@ impl Timeline { walredo_mgr: Arc, remote_client: Option, pg_version: u32, - ) -> Self { + ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(TimelineState::Suspended); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let mut result = Timeline { - conf, - tenant_conf, - timeline_id, - tenant_id, - pg_version, - layers: RwLock::new(LayerMap::default()), + Arc::new_cyclic(|myself| { + let mut result = Timeline { + conf, + tenant_conf, + _myself: myself.clone(), + timeline_id, + tenant_id, + pg_version, + layers: RwLock::new(LayerMap::default()), - walredo_mgr, + walredo_mgr, - remote_client: remote_client.map(Arc::new), + remote_client: remote_client.map(Arc::new), - // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. - last_record_lsn: SeqWait::new(RecordLsn { - last: disk_consistent_lsn, - prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), - }), - disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. 
+ last_record_lsn: SeqWait::new(RecordLsn { + last: disk_consistent_lsn, + prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), + }), + disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), - last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), - last_freeze_ts: RwLock::new(Instant::now()), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), + last_freeze_ts: RwLock::new(Instant::now()), - ancestor_timeline: ancestor, - ancestor_lsn: metadata.ancestor_lsn(), + ancestor_timeline: ancestor, + ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new(&tenant_id, &timeline_id), + metrics: TimelineMetrics::new(&tenant_id, &timeline_id), - flush_loop_state: Mutex::new(FlushLoopState::NotStarted), + flush_loop_state: Mutex::new(FlushLoopState::NotStarted), - layer_flush_start_tx, - layer_flush_done_tx, + layer_flush_start_tx, + layer_flush_done_tx, - write_lock: Mutex::new(()), - layer_removal_cs: Default::default(), + write_lock: Mutex::new(()), + layer_removal_cs: Default::default(), - gc_info: RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: RwLock::new(GcInfo { + retain_lsns: Vec::new(), + horizon_cutoff: Lsn(0), + pitr_cutoff: Lsn(0), + }), - latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), - initdb_lsn: metadata.initdb_lsn(), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), + initdb_lsn: metadata.initdb_lsn(), - current_logical_size: if disk_consistent_lsn.is_valid() { - // we're creating timeline data with some layer files existing locally, - // need to recalculate timeline's logical size based on data in the layers. - LogicalSize::deferred_initial(disk_consistent_lsn) - } else { - // we're creating timeline data without any layers existing locally, - // initial logical size is 0. - LogicalSize::empty_initial() - }, - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), - repartition_threshold: 0, + current_logical_size: if disk_consistent_lsn.is_valid() { + // we're creating timeline data with some layer files existing locally, + // need to recalculate timeline's logical size based on data in the layers. + LogicalSize::deferred_initial(disk_consistent_lsn) + } else { + // we're creating timeline data without any layers existing locally, + // initial logical size is 0. + LogicalSize::empty_initial() + }, + partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + repartition_threshold: 0, - last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), - state, - }; - result.repartition_threshold = result.get_checkpoint_distance() / 10; - result + last_received_wal: Mutex::new(None), + rel_size_cache: RwLock::new(HashMap::new()), + state, + }; + result.repartition_threshold = result.get_checkpoint_distance() / 10; + result + }) } pub(super) fn maybe_spawn_flush_loop(self: &Arc) { From f5b424b96cee32b80cbee43020628d5feb2e57df Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 7 Dec 2022 12:52:25 -0500 Subject: [PATCH 126/167] no-op: type aliases for Layer::iter and Layer::key_iter return types Not needed by anything right now, but the next commit adds a `Result<>` around iter() and key_iter()'s return types, and that makes clippy complain. 
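A rough sketch of the motivation, using hypothetical u64/Vec stand-ins for the real Key/Lsn/Value
types rather than the actual pageserver definitions: once the next commit wraps the return value
in `Result<>`, the inline signature would otherwise read

    // fn iter(&self) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + '_>>

which is the kind of nesting clippy's `type_complexity` lint flags. With the aliases introduced
here, only the outer `Result` needs to be added later:

    // placeholder types, for illustration only
    type Key = u64;
    type Lsn = u64;
    type Value = Vec<u8>;

    /// Same shape as the `LayerIter` alias added in the diff below.
    pub type LayerIter<'i> = Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'i>;

    pub trait PersistentLayer {
        // this commit: infallible; next commit: anyhow::Result<LayerIter<'_>>
        fn iter(&self) -> LayerIter<'_>;
    }
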
--- pageserver/src/tenant/delta_layer.rs | 6 +++--- pageserver/src/tenant/image_layer.rs | 6 +++--- pageserver/src/tenant/storage_layer.rs | 10 ++++++++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index d8aaa3e8b9..cff819d878 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -55,7 +55,7 @@ use utils::{ }; use super::filename::LayerFileName; -use super::storage_layer::Layer; +use super::storage_layer::{Layer, LayerIter, LayerKeyIter}; /// /// Header stored in the beginning of the file @@ -391,7 +391,7 @@ impl PersistentLayer for DeltaLayer { self.path() } - fn iter<'a>(&'a self) -> Box> + 'a> { + fn iter(&self) -> LayerIter<'_> { let inner = match self.load() { Ok(inner) => inner, Err(e) => panic!("Failed to load a delta layer: {e:?}"), @@ -403,7 +403,7 @@ impl PersistentLayer for DeltaLayer { } } - fn key_iter<'a>(&'a self) -> Box + 'a> { + fn key_iter(&self) -> LayerKeyIter<'_> { let inner = match self.load() { Ok(inner) => inner, Err(e) => panic!("Failed to load a delta layer: {e:?}"), diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index e08e938a4f..fe9de855e7 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -21,7 +21,7 @@ //! actual page images are stored in the "values" part. use crate::config::PageServerConf; use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, Value, KEY_SIZE}; +use crate::repository::{Key, KEY_SIZE}; use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; @@ -51,7 +51,7 @@ use utils::{ }; use super::filename::LayerFileName; -use super::storage_layer::Layer; +use super::storage_layer::{Layer, LayerIter}; /// /// Header stored in the beginning of the file @@ -219,7 +219,7 @@ impl PersistentLayer for ImageLayer { fn get_timeline_id(&self) -> TimelineId { self.timeline_id } - fn iter(&self) -> Box>> { + fn iter(&self) -> LayerIter<'_> { unimplemented!(); } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3ad62587d3..82c25c063b 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -116,6 +116,12 @@ pub trait Layer: Send + Sync { fn dump(&self, verbose: bool) -> Result<()>; } +/// Returned by [`Layer::iter`] +pub type LayerIter<'i> = Box> + 'i>; + +/// Returned by [`Layer::key_iter`] +pub type LayerKeyIter<'i> = Box + 'i>; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -144,11 +150,11 @@ pub trait PersistentLayer: Layer { fn local_path(&self) -> PathBuf; /// Iterate through all keys and values stored in the layer - fn iter(&self) -> Box> + '_>; + fn iter(&self) -> LayerIter<'_>; /// Iterate through all keys stored in the layer. 
Returns key, lsn and value size /// It is used only for compaction and so is currently implemented only for DeltaLayer - fn key_iter(&self) -> Box + '_> { + fn key_iter(&self) -> LayerKeyIter<'_> { panic!("Not implemented") } From e94b4514301501ae43330c38e7467b88901c7dd8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 5 Dec 2022 06:43:26 -0500 Subject: [PATCH 127/167] no-op: storage_layer::Iter::{iter, key_iter}: make them fallible --- pageserver/src/tenant/delta_layer.rs | 27 ++++-------- pageserver/src/tenant/image_layer.rs | 2 +- pageserver/src/tenant/inmemory_layer.rs | 1 + pageserver/src/tenant/storage_layer.rs | 4 +- pageserver/src/tenant/timeline.rs | 56 +++++++++++++------------ 5 files changed, 42 insertions(+), 48 deletions(-) diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index cff819d878..a252abf2a0 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -391,28 +391,19 @@ impl PersistentLayer for DeltaLayer { self.path() } - fn iter(&self) -> LayerIter<'_> { - let inner = match self.load() { - Ok(inner) => inner, - Err(e) => panic!("Failed to load a delta layer: {e:?}"), - }; - - match DeltaValueIter::new(inner) { + fn iter(&self) -> Result> { + let inner = self.load().context("load delta layer")?; + Ok(match DeltaValueIter::new(inner) { Ok(iter) => Box::new(iter), Err(err) => Box::new(std::iter::once(Err(err))), - } + }) } - fn key_iter(&self) -> LayerKeyIter<'_> { - let inner = match self.load() { - Ok(inner) => inner, - Err(e) => panic!("Failed to load a delta layer: {e:?}"), - }; - - match DeltaKeyIter::new(inner) { - Ok(iter) => Box::new(iter), - Err(e) => panic!("Layer index is corrupted: {e:?}"), - } + fn key_iter(&self) -> Result> { + let inner = self.load()?; + Ok(Box::new( + DeltaKeyIter::new(inner).context("Layer index is corrupted")?, + )) } fn delete(&self) -> Result<()> { diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index fe9de855e7..c907d21af5 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -219,7 +219,7 @@ impl PersistentLayer for ImageLayer { fn get_timeline_id(&self) -> TimelineId { self.timeline_id } - fn iter(&self) -> LayerIter<'_> { + fn iter(&self) -> Result> { unimplemented!(); } diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs index 8f64281cb1..35b0e98591 100644 --- a/pageserver/src/tenant/inmemory_layer.rs +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -97,6 +97,7 @@ impl Layer for InMemoryLayer { }; self.start_lsn..end_lsn } + fn is_incremental(&self) -> bool { // in-memory layer is always considered incremental. true diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 82c25c063b..ba0311574d 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -150,11 +150,11 @@ pub trait PersistentLayer: Layer { fn local_path(&self) -> PathBuf; /// Iterate through all keys and values stored in the layer - fn iter(&self) -> LayerIter<'_>; + fn iter(&self) -> Result>; /// Iterate through all keys stored in the layer. 
Returns key, lsn and value size /// It is used only for compaction and so is currently implemented only for DeltaLayer - fn key_iter(&self) -> LayerKeyIter<'_> { + fn key_iter(&self) -> Result> { panic!("Not implemented") } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e891caa6f8..34cb01cdd8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2195,38 +2195,40 @@ impl Timeline { // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. - let all_values_iter = deltas_to_compact - .iter() - .map(|l| l.iter()) - .kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, + let all_values_iter = + itertools::process_results(deltas_to_compact.iter().map(|l| l.iter()), |iter_iter| { + iter_iter.kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false } } else { - false + true } - } else { - true - } - }); + }) + })?; // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = deltas_to_compact - .iter() - .map(|l| l.key_iter()) - .kmerge_by(|a, b| { - let (a_key, a_lsn, _) = a; - let (b_key, b_lsn, _) = b; - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, - } - }); + let mut all_keys_iter = itertools::process_results( + deltas_to_compact.iter().map(|l| l.key_iter()), + |iter_iter| { + iter_iter.kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }) + }, + )?; // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. From 749a2f00d71750a9055b21e8b565a3e9b51eecd5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 5 Dec 2022 06:45:10 -0500 Subject: [PATCH 128/167] no-op: distinguished error types for Timeline::get_reconstruct_data --- pageserver/src/tenant/timeline.rs | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 34cb01cdd8..0b31c9bdc4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1483,7 +1483,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result<()> { + ) -> Result<(), PageReconstructError> { // Start from the current timeline. let mut timeline_owned; let mut timeline = self; @@ -2828,12 +2828,31 @@ impl Timeline { } } +/// An error happened in a get() operation. 
+#[derive(thiserror::Error)] +pub enum PageReconstructError { + #[error(transparent)] + Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + + #[error(transparent)] + WalRedo(#[from] crate::walredo::WalRedoError), +} + +impl std::fmt::Debug for PageReconstructError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> { + match self { + PageReconstructError::Other(err) => err.fmt(f), + PageReconstructError::WalRedo(err) => err.fmt(f), + } + } +} + /// Helper function for get_reconstruct_data() to add the path of layers traversed /// to an error, as anyhow context information. fn layer_traversal_error( msg: String, path: Vec<(ValueReconstructResult, Lsn, TraversalId)>, -) -> anyhow::Result<()> { +) -> Result<(), PageReconstructError> { // We want the original 'msg' to be the outermost context. The outermost context // is the most high-level information, which also gets propagated to the client. let mut msg_iter = path @@ -2849,7 +2868,8 @@ fn layer_traversal_error( let err = anyhow!(msg_iter.next().unwrap()); // Append all subsequent traversals, and the error message 'msg', as contexts. - Err(msg_iter.fold(err, |err, msg| err.context(msg))) + let msg = msg_iter.fold(err, |err, msg| err.context(msg)); + Err(PageReconstructError::Other(msg)) } /// Various functions to mutate the timeline. From 24609873287823a7d208e14cd227041c3c0bcc12 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 5 Dec 2022 09:38:41 -0500 Subject: [PATCH 129/167] no-op: pgdatadir_mapping: qualified use of anyhow::Result --- pageserver/src/pgdatadir_mapping.rs | 107 ++++++++++++++++------------ 1 file changed, 63 insertions(+), 44 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 797ee9f436..7b4b05ed18 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,7 +10,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{self, bail, ensure, Context}; use bytes::{Buf, Bytes}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -97,7 +97,7 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, - ) -> Result { + ) -> anyhow::Result { ensure!(tag.relnode != 0, "invalid relnode"); let nblocks = self.get_rel_size(tag, lsn, latest)?; @@ -114,7 +114,13 @@ impl Timeline { } // Get size of a database in blocks - pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result { + pub fn get_db_size( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + latest: bool, + ) -> anyhow::Result { let mut total_blocks = 0; let rels = self.list_rels(spcnode, dbnode, lsn)?; @@ -127,7 +133,7 @@ impl Timeline { } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result { + pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> anyhow::Result { ensure!(tag.relnode != 0, "invalid relnode"); if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { @@ -162,7 +168,7 @@ impl Timeline { } /// Does relation exist? 
- pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result { + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> anyhow::Result { ensure!(tag.relnode != 0, "invalid relnode"); // first try to lookup relation in cache @@ -180,7 +186,12 @@ impl Timeline { } /// Get a list of all existing relations in given tablespace and database. - pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + pub fn list_rels( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + ) -> anyhow::Result> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); let buf = self.get(key, lsn)?; @@ -204,7 +215,7 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, - ) -> Result { + ) -> anyhow::Result { let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn) } @@ -215,14 +226,19 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> Result { + ) -> anyhow::Result { let key = slru_segment_size_to_key(kind, segno); let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + pub fn get_slru_segment_exists( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ) -> anyhow::Result { // fetch directory listing let key = slru_dir_to_key(kind); let buf = self.get(key, lsn)?; @@ -239,7 +255,10 @@ impl Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + pub fn find_lsn_for_timestamp( + &self, + search_timestamp: TimestampTz, + ) -> anyhow::Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; let max_lsn = self.get_last_record_lsn(); @@ -308,7 +327,7 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, - ) -> Result { + ) -> anyhow::Result { for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? 
{ let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; for blknum in (0..nblocks).rev() { @@ -333,7 +352,7 @@ impl Timeline { } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> anyhow::Result> { // fetch directory entry let key = slru_dir_to_key(kind); @@ -343,14 +362,14 @@ impl Timeline { Ok(dir.segments) } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> anyhow::Result { let key = relmap_file_key(spcnode, dbnode); let buf = self.get(key, lsn)?; Ok(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> anyhow::Result> { // fetch directory entry let buf = self.get(DBDIR_KEY, lsn)?; let dir = DbDirectory::des(&buf)?; @@ -358,13 +377,13 @@ impl Timeline { Ok(dir.dbdirs) } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result { let key = twophase_file_key(xid); let buf = self.get(key, lsn)?; Ok(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + pub fn list_twophase_files(&self, lsn: Lsn) -> anyhow::Result> { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let dir = TwoPhaseDirectory::des(&buf)?; @@ -372,11 +391,11 @@ impl Timeline { Ok(dir.xids) } - pub fn get_control_file(&self, lsn: Lsn) -> Result { + pub fn get_control_file(&self, lsn: Lsn) -> anyhow::Result { self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> Result { + pub fn get_checkpoint(&self, lsn: Lsn) -> anyhow::Result { self.get(CHECKPOINT_KEY, lsn) } @@ -414,7 +433,7 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub fn collect_keyspace(&self, lsn: Lsn) -> Result { + pub fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -553,7 +572,7 @@ impl<'a> DatadirModification<'a> { /// /// This inserts the directory metadata entries that are assumed to /// always exist. 
- pub fn init_empty(&mut self) -> Result<()> { + pub fn init_empty(&mut self) -> anyhow::Result<()> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; @@ -586,7 +605,7 @@ impl<'a> DatadirModification<'a> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> Result<()> { + ) -> anyhow::Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) @@ -599,7 +618,7 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, rec: NeonWalRecord, - ) -> Result<()> { + ) -> anyhow::Result<()> { self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), @@ -613,7 +632,7 @@ impl<'a> DatadirModification<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { + ) -> anyhow::Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) @@ -625,13 +644,13 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { + ) -> anyhow::Result<()> { self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); Ok(()) } /// Store a relmapper file (pg_filenode.map) in the repository - pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> { + pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY)?; let mut dbdir = DbDirectory::des(&buf)?; @@ -659,7 +678,7 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> { + pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> { // Add it to the directory entry let buf = self.get(TWOPHASEDIR_KEY)?; let mut dir = TwoPhaseDirectory::des(&buf)?; @@ -675,17 +694,17 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn put_control_file(&mut self, img: Bytes) -> Result<()> { + pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) } - pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> { + pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CHECKPOINT_KEY, Value::Image(img)); Ok(()) } - pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; @@ -714,7 +733,7 @@ impl<'a> DatadirModification<'a> { /// Create a relation fork. /// /// 'nblocks' is the initial size. - pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. @@ -758,7 +777,7 @@ impl<'a> DatadirModification<'a> { } /// Truncate relation - pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); if self.tline.get_rel_exists(rel, last_lsn, true)? 
{ @@ -784,7 +803,7 @@ impl<'a> DatadirModification<'a> { /// Extend relation /// If new size is smaller, do nothing. - pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); // Put size @@ -805,7 +824,7 @@ impl<'a> DatadirModification<'a> { } /// Drop a relation. - pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> { + pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry @@ -838,7 +857,7 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key)?; @@ -868,7 +887,7 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); @@ -877,7 +896,7 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> { + pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key)?; @@ -898,13 +917,13 @@ impl<'a> DatadirModification<'a> { } /// Drop a relmapper file (pg_filenode.map) - pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> { + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> { // TODO Ok(()) } /// This method is used for marking truncated SLRU files - pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> { + pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY)?; let mut dir = TwoPhaseDirectory::des(&buf)?; @@ -941,7 +960,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self) -> Result<()> { + pub fn flush(&mut self) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -952,7 +971,7 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer(); // Flush relation and SLRU data blocks, keep metadata. - let mut result: Result<()> = Ok(()); + let mut result: anyhow::Result<()> = Ok(()); self.pending_updates.retain(|&key, value| { if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { result = writer.put(key, self.lsn, value); @@ -1000,7 +1019,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - fn get(&self, key: Key) -> Result { + fn get(&self, key: Key) -> anyhow::Result { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1370,7 +1389,7 @@ const CHECKPOINT_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. 
-pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { +pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( RelTag { @@ -1400,7 +1419,7 @@ pub fn is_rel_vm_block_key(key: Key) -> bool { && key.field6 != 0xffffffff } -pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { +pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { Ok(match key.field1 { 0x01 => { let kind = match key.field2 { @@ -1429,7 +1448,7 @@ pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, pg_version: u32, -) -> Result> { +) -> anyhow::Result> { let tline = tenant .create_empty_timeline(timeline_id, Lsn(8), pg_version)? .initialize()?; From 1da03141a74a649f35dc7f9992e050bb9f4d9db3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 12 Dec 2022 15:16:08 -0500 Subject: [PATCH 130/167] refactor: make Layer::local_path return Option instead of PathBuf This is in preparation for RemoteLayer, which by definition doesn't have a local path. --- pageserver/src/tenant/delta_layer.rs | 4 +- pageserver/src/tenant/image_layer.rs | 4 +- pageserver/src/tenant/storage_layer.rs | 3 +- pageserver/src/tenant/timeline.rs | 54 +++++++++++++++++--------- 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index a252abf2a0..e1006dfe00 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -387,8 +387,8 @@ impl PersistentLayer for DeltaLayer { self.layer_name().into() } - fn local_path(&self) -> PathBuf { - self.path() + fn local_path(&self) -> Option { + Some(self.path()) } fn iter(&self) -> Result> { diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index c907d21af5..b1dbbfb683 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -208,8 +208,8 @@ impl PersistentLayer for ImageLayer { self.layer_name().into() } - fn local_path(&self) -> PathBuf { - self.path() + fn local_path(&self) -> Option { + Some(self.path()) } fn get_tenant_id(&self) -> TenantId { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index ba0311574d..79eaa96591 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -147,7 +147,8 @@ pub trait PersistentLayer: Layer { fn filename(&self) -> LayerFileName; // Path to the layer file in the local filesystem. - fn local_path(&self) -> PathBuf; + // `None` for `RemoteLayer`. + fn local_path(&self) -> Option; /// Iterate through all keys and values stored in the layer fn iter(&self) -> Result>; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0b31c9bdc4..59d3486644 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1031,11 +1031,13 @@ impl Timeline { .unwrap_or(LayerFileMetadata::MISSING); // Is the local layer's size different from the size stored in the - // remote index file? If so, rename_to_backup those files & remove - // local_layer form the layer map. - // We'll download a fresh copy of the layer file below. + // remote index file? + // If so, rename_to_backup those files & replace their local layer with + // a RemoteLayer in the laye rmap so that we re-download them on-demand. 
if let Some(local_layer) = local_layer { - let local_layer_path = local_layer.local_path(); + let local_layer_path = local_layer + .local_path() + .expect("caller must ensure that local_layers only contains local layers"); ensure!( local_layer_path.exists(), "every layer from local_layers must exist on disk: {}", @@ -1210,7 +1212,10 @@ impl Timeline { // Are there local files that don't exist remotely? Schedule uploads for them for (layer_name, layer) in &local_only_layers { - let layer_path = layer.local_path(); + // XXX solve this in the type system + let layer_path = layer + .local_path() + .expect("local_only_layers only contains local layers"); let layer_size = layer_path .metadata() .with_context(|| format!("failed to get file {layer_path:?} metadata"))? @@ -1450,12 +1455,21 @@ trait TraversalLayerExt { impl TraversalLayerExt for Arc { fn traversal_id(&self) -> String { - debug_assert!( - self.local_path().to_str().unwrap() - .contains(&format!("{}", self.get_timeline_id())), - "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary", - ); - format!("{}", self.local_path().display()) + match self.local_path() { + Some(local_path) => { + debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())), + "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary", + ); + format!("{}", local_path.display()) + } + None => { + format!( + "remote {}/{}", + self.get_timeline_id(), + self.filename().file_name() + ) + } + } } } @@ -2440,10 +2454,11 @@ impl Timeline { // delete the old ones let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { - let path = l.local_path(); - self.metrics - .current_physical_size_gauge - .sub(path.metadata()?.len()); + if let Some(path) = l.local_path() { + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); + } layer_names_to_delete.push(l.filename()); l.delete()?; layers.remove_historic(l); @@ -2726,10 +2741,11 @@ impl Timeline { // while iterating it. BTreeMap::retain() would be another option) let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { - let path = doomed_layer.local_path(); - self.metrics - .current_physical_size_gauge - .sub(path.metadata()?.len()); + if let Some(path) = doomed_layer.local_path() { + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); + } layer_names_to_delete.push(doomed_layer.filename()); doomed_layer.delete()?; layers.remove_historic(doomed_layer); From 31543c4acc330060712036ae22651995c2b29a28 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 9 Dec 2022 11:20:04 -0500 Subject: [PATCH 131/167] refactor: make update_gc_info and transitive callers async This is so that in the next commit, we can add a retry_get to find_lsn_for_timestamp. --- pageserver/src/tenant.rs | 134 +++++++++++++++++++----------- pageserver/src/tenant/size.rs | 1 + pageserver/src/tenant/timeline.rs | 6 +- 3 files changed, 88 insertions(+), 53 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ce05d8f085..799a34fb3b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -129,7 +129,7 @@ pub struct Tenant { // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... 
- gc_cs: Mutex<()>, + gc_cs: tokio::sync::Mutex<()>, walredo_mgr: Arc, // provides access to timeline data sitting in the remote storage @@ -1158,7 +1158,8 @@ impl Tenant { ancestor_timeline.wait_lsn(*lsn).await?; } - self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn) + .await? } None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; @@ -1683,7 +1684,7 @@ impl Tenant { conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), - gc_cs: Mutex::new(()), + gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, state, @@ -1834,7 +1835,9 @@ impl Tenant { let mut totals: GcResult = Default::default(); let now = Instant::now(); - let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?; + let gc_timelines = self + .refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .await?; utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -1869,7 +1872,7 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub fn refresh_gc_info(&self) -> anyhow::Result>> { + pub async fn refresh_gc_info(&self) -> anyhow::Result>> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -1880,54 +1883,60 @@ impl Tenant { let target_timeline_id = None; self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .await } - fn refresh_gc_info_internal( + async fn refresh_gc_info_internal( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. - let gc_cs = self.gc_cs.lock().unwrap(); - - let timelines = self.timelines.lock().unwrap(); + let gc_cs = self.gc_cs.lock().await; // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. 
- let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new(); - let timeline_ids = { - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") - } - }; + let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = { + let timelines = self.timelines.lock().unwrap(); + let mut all_branchpoints = BTreeSet::new(); + let timeline_ids = { + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; - timelines - .iter() - .map(|(timeline_id, timeline_entry)| { - if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timeline_id) = target_timeline_id { - if ancestor_timeline_id == &timeline_id { + timelines + .iter() + .map(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = + &timeline_entry.get_ancestor_timeline_id() + { + // If target_timeline is specified, we only need to know branchpoints of its children + if let Some(timeline_id) = target_timeline_id { + if ancestor_timeline_id == &timeline_id { + all_branchpoints.insert(( + *ancestor_timeline_id, + timeline_entry.get_ancestor_lsn(), + )); + } + } + // Collect branchpoints for all timelines + else { all_branchpoints.insert(( *ancestor_timeline_id, timeline_entry.get_ancestor_lsn(), )); } } - // Collect branchpoints for all timelines - else { - all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn())); - } - } - *timeline_id - }) - .collect::>() + *timeline_id + }) + .collect::>() + }; + (all_branchpoints, timeline_ids) }; - drop(timelines); // Ok, we now know all the branch points. // Update the GC information for each timeline. @@ -1953,7 +1962,7 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline.update_gc_info(branchpoints, cutoff, pitr)?; + timeline.update_gc_info(branchpoints, cutoff, pitr).await?; gc_timelines.push(timeline); } @@ -1963,7 +1972,7 @@ impl Tenant { } /// Branch an existing timeline - fn branch_timeline( + async fn branch_timeline( &self, src: TimelineId, dst: TimelineId, @@ -1972,10 +1981,11 @@ impl Tenant { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().unwrap(); - let timelines = self.timelines.lock().unwrap(); - let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?; - drop(timelines); + let _gc_cs = self.gc_cs.lock().await; + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(dst, &timelines)? 
+ }; // In order for the branch creation task to not wait for GC/compaction, // we need to make sure that the starting LSN of the child branch is not out of scope midway by @@ -2837,7 +2847,9 @@ mod tests { //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); @@ -2925,7 +2937,10 @@ mod tests { .await?; // try to branch at lsn 25, should fail because we already garbage collected the data - match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .await + { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); @@ -2950,7 +2965,10 @@ mod tests { .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? .initialize()?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .await + { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); @@ -2998,7 +3016,9 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); @@ -3020,7 +3040,9 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); @@ -3074,7 +3096,9 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3225,7 +3249,9 @@ mod tests { let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; @@ -3296,7 +3322,9 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; @@ -3345,7 +3373,9 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = TimelineId::generate(); - tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tenant + .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); @@ -3378,7 +3408,9 @@ mod tests { // Perform 
a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; @@ -3409,7 +3441,9 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); - tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tenant + .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 597461ce29..5ce0837562 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -70,6 +70,7 @@ pub(super) async fn gather_inputs( let timelines = tenant .refresh_gc_info() + .await .context("Failed to refresh gc_info before gathering inputs")?; if timelines.is_empty() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 59d3486644..61d619a17b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -160,7 +160,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. - pub gc_info: RwLock, + pub gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -794,7 +794,7 @@ impl Timeline { write_lock: Mutex::new(()), layer_removal_cs: Default::default(), - gc_info: RwLock::new(GcInfo { + gc_info: std::sync::RwLock::new(GcInfo { retain_lsns: Vec::new(), horizon_cutoff: Lsn(0), pitr_cutoff: Lsn(0), @@ -2499,7 +2499,7 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - pub(super) fn update_gc_info( + pub(super) async fn update_gc_info( &self, retain_lsns: Vec, cutoff_horizon: Lsn, From 7ff591ffbfc8f084e0b1b5cdbed8bd69e008d4c0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 5 Dec 2022 10:20:24 -0500 Subject: [PATCH 132/167] On-Demand Download MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code in this change was extracted from #2595 (Heikki’s on-demand download draft PR). High-Level Changes - New RemoteLayer Type - On-Demand Download As An Effect Of Page Reconstruction - Breaking Semantics For Physical Size Metrics There are several follow-up work items planned. Refer to the Epic issue on GitHub: https://github.com/neondatabase/neon/issues/2029 closes https://github.com/neondatabase/neon/pull/3013 Co-authored-by: Kirill Bulatov Co-authored-by: Christian Schwarz New RemoteLayer Type ==================== Instead of downloading all layers during tenant attach, we create RemoteLayer instances for each of them and add them to the layer map. On-Demand Download As An Effect Of Page Reconstruction ====================================================== At the heart of pageserver is Timeline::get_reconstruct_data(). It traverses the layer map until it has collected all the data it needs to produce the page image. Most code in the code base uses it, though many layers of indirection. Before this patch, the function would use synchronous filesystem IO to load data from disk-resident layer files if the data was not cached. 
That is not possible with RemoteLayer, because the layer file has not been downloaded yet. So, we do the download when get_reconstruct_data gets there, i.e., “on demand”. The mechanics of how the download is done are rather involved, because of the infamous async-sync-async sandwich problem that plagues the async Rust world. We use the new PageReconstructResult type to work around this. Its introduction is the cause for a good amount of code churn in this patch. Refer to the block comment on `with_ondemand_download()` for details. Breaking Semantics For Physical Size Metrics ============================================ We rename prometheus metric pageserver_{current,resident}_physical_size to reflect what this metric actually represents with on-demand download. This intentionally BREAKS existing grafana dashboard and the cost model data pipeline. Breaking is desirable because the meaning of this metrics has changed with on-demand download. See https://docs.google.com/document/d/12AFpvKY-7FZdR5a4CaD6Ir_rI3QokdCLSPJ6upHxJBo/edit# for how we will handle this breakage. Likewise, we rename the new billing_metrics’s PhysicalSize => ResidentSize. This is not yet used anywhere, so, this is not a breaking change. There is still a field called TimelineInfo::current_physical_size. It is now the sum of the layer sizes in layer map, regardless of whether local or remote. To compute that sum, we added a new trait method PersistentLayer::file_size(). When updating the Python tests, we got rid of current_physical_size_non_incremental. An earlier commit removed it from the OpenAPI spec already, so this is not a breaking change. test_timeline_size.py has grown additional assertions on the resident_physical_size metric. --- libs/pageserver_api/src/models.rs | 23 +- pageserver/src/basebackup.rs | 65 +- pageserver/src/billing_metrics.rs | 28 +- pageserver/src/http/routes.rs | 114 ++- pageserver/src/import_datadir.rs | 14 +- pageserver/src/lib.rs | 2 +- pageserver/src/metrics.rs | 22 +- pageserver/src/page_service.rs | 30 +- pageserver/src/pgdatadir_mapping.rs | 311 ++++--- pageserver/src/storage_sync2.rs | 46 +- pageserver/src/storage_sync2/download.rs | 4 + pageserver/src/task_mgr.rs | 18 + pageserver/src/tenant.rs | 62 +- pageserver/src/tenant/delta_layer.rs | 25 +- pageserver/src/tenant/image_layer.rs | 25 +- pageserver/src/tenant/remote_layer.rs | 212 +++++ pageserver/src/tenant/size.rs | 2 - pageserver/src/tenant/storage_layer.rs | 27 + pageserver/src/tenant/timeline.rs | 832 ++++++++++++++---- pageserver/src/virtual_file.rs | 6 +- pageserver/src/walingest.rs | 470 +++++++--- .../src/walreceiver/connection_manager.rs | 2 +- .../src/walreceiver/walreceiver_connection.rs | 17 +- pageserver/src/walrecord.rs | 1 + scripts/export_import_between_pageservers.py | 10 +- test_runner/fixtures/metrics.py | 2 +- test_runner/fixtures/neon_fixtures.py | 172 +++- test_runner/regress/test_broken_timeline.py | 11 +- test_runner/regress/test_metric_collection.py | 2 +- test_runner/regress/test_ondemand_download.py | 437 +++++++++ test_runner/regress/test_remote_storage.py | 69 +- test_runner/regress/test_tenant_relocation.py | 25 +- test_runner/regress/test_tenant_tasks.py | 34 +- .../test_tenants_with_remote_storage.py | 34 +- test_runner/regress/test_timeline_size.py | 89 +- test_runner/regress/test_wal_acceptor.py | 24 +- 36 files changed, 2556 insertions(+), 711 deletions(-) create mode 100644 pageserver/src/tenant/remote_layer.rs create mode 100644 test_runner/regress/test_ondemand_download.py diff --git 
a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 586ce2a73a..88603d9539 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -163,6 +163,8 @@ pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: TenantId, pub state: TenantState, + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, } @@ -191,9 +193,12 @@ pub struct TimelineInfo { #[serde_as(as = "DisplayFromStr")] pub remote_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, - pub current_physical_size_non_incremental: Option, + + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, #[serde_as(as = "Option")] @@ -205,6 +210,22 @@ pub struct TimelineInfo { pub state: TimelineState, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct DownloadRemoteLayersTaskInfo { + pub task_id: String, + pub state: DownloadRemoteLayersTaskState, + pub total_layer_count: u64, // stable once `completed` + pub successful_download_count: u64, // stable once `completed` + pub failed_download_count: u64, // stable once `completed` +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum DownloadRemoteLayersTaskState { + Running, + Completed, + ShutDown, +} + pub type ConfigureFailpointsRequest = Vec; /// Information for configuring a single fail point diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 973c3cd3a6..aa87865a8a 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,7 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::tenant::Timeline; +use crate::task_mgr; +use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; @@ -152,23 +153,29 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn)? { + for segno in + with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))? + { self.add_slru_segment(kind, segno)?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { + for ((spcnode, dbnode), has_relmap_file) in + with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))? + { self.add_dbdir(spcnode, dbnode, has_relmap_file)?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? { + for rel in with_ondemand_download_sync(|| { + self.timeline.list_rels(spcnode, dbnode, self.lsn) + })? { self.add_rel(rel)?; } } } - for xid in self.timeline.list_twophase_files(self.lsn)? { + for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? 
{ self.add_twophase_file(xid)?; } @@ -185,7 +192,8 @@ where } fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?; + let nblocks = + with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?; // Function that adds relation segment data to archive let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { @@ -208,7 +216,8 @@ where for blknum in blocks { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?; + .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .no_ondemand_download()?; segment_data.extend_from_slice(&img[..]); } @@ -222,13 +231,16 @@ where // Generate SLRU segment files from repository. // fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; + let nblocks = with_ondemand_download_sync(|| { + self.timeline.get_slru_segment_size(slru, segno, self.lsn) + })?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { - let img = self - .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; + let img = with_ondemand_download_sync(|| { + self.timeline + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + })?; if slru == SlruKind::Clog { ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); @@ -260,7 +272,9 @@ where has_relmap_file: bool, ) -> anyhow::Result<()> { let relmap_img = if has_relmap_file { - let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; + let img = with_ondemand_download_sync(|| { + self.timeline.get_relmap_file(spcnode, dbnode, self.lsn) + })?; ensure!(img.len() == 512); Some(img) } else { @@ -295,7 +309,8 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn)? + .list_rels(spcnode, dbnode, self.lsn) + .no_ondemand_download()? .is_empty() { return Ok(()); @@ -327,7 +342,7 @@ where // Extract twophase state files // fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn)?; + let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -361,14 +376,12 @@ where zenith_signal.as_bytes(), )?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) - .context("failed get control bytes")?; + let checkpoint_bytes = + with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn)) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = + with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn)) + .context("failed get control bytes")?; let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( &pg_control_bytes, @@ -490,3 +503,11 @@ where } } } + +fn with_ondemand_download_sync(f: F) -> anyhow::Result +where + F: Send + Fn() -> PageReconstructResult, + T: Send, +{ + task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f)) +} diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs index c5da54b8fc..f9d3e8553f 100644 --- a/pageserver/src/billing_metrics.rs +++ b/pageserver/src/billing_metrics.rs @@ -73,10 +73,10 @@ pub enum BillingMetricKind { /// This is an absolute, per-tenant metric. 
/// This is the same metric that tenant/tenant_id/size endpoint returns. SyntheticStorageSize, - /// Size of all the files in the tenant's directory on disk on the pageserver. + /// Size of all the layer files in the tenant's directory on disk on the pageserver. /// This is an absolute, per-tenant metric. - /// See also prometheus metric CURRENT_PHYSICAL_SIZE. - PhysicalSize, + /// See also prometheus metric RESIDENT_PHYSICAL_SIZE. + ResidentSize, /// Size of the remote storage (S3) directory. /// This is an absolute, per-tenant metric. RemoteStorageSize, @@ -89,7 +89,7 @@ impl FromStr for BillingMetricKind { match s { "written_size" => Ok(Self::WrittenSize), "synthetic_storage_size" => Ok(Self::SyntheticStorageSize), - "physical_size" => Ok(Self::PhysicalSize), + "resident_size" => Ok(Self::ResidentSize), "remote_storage_size" => Ok(Self::RemoteStorageSize), _ => anyhow::bail!("invalid value \"{s}\" for metric type"), } @@ -101,7 +101,7 @@ impl fmt::Display for BillingMetricKind { f.write_str(match self { BillingMetricKind::WrittenSize => "written_size", BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size", - BillingMetricKind::PhysicalSize => "physical_size", + BillingMetricKind::ResidentSize => "resident_size", BillingMetricKind::RemoteStorageSize => "remote_storage_size", }) } @@ -171,7 +171,7 @@ pub async fn collect_metrics_task( let tenant = tenant_mgr::get_tenant(tenant_id, true).await?; - let mut tenant_physical_size = 0; + let mut tenant_resident_size = 0; // iterate through list of timelines in tenant for timeline in tenant.list_timelines().iter() { @@ -186,27 +186,27 @@ pub async fn collect_metrics_task( timeline_written_size, )); - let timeline_size = timeline.get_physical_size(); - tenant_physical_size += timeline_size; + let timeline_resident_size = timeline.get_resident_physical_size(); + tenant_resident_size += timeline_resident_size; debug!( - "per-timeline current metrics for tenant: {}: timeline {} physical_size={} last_record_lsn {} (as bytes)", - tenant_id, timeline.timeline_id, timeline_size, timeline_written_size) + "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)", + tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size) } let tenant_remote_size = tenant.get_remote_size().await?; debug!( - "collected current metrics for tenant: {}: state={:?} tenant_physical_size={} remote_size={}", - tenant_id, tenant_state, tenant_physical_size, tenant_remote_size + "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}", + tenant_id, tenant_state, tenant_resident_size, tenant_remote_size ); current_metrics.push(( BillingMetricsKey { tenant_id, timeline_id: None, - metric: BillingMetricKind::PhysicalSize, + metric: BillingMetricKind::ResidentSize, }, - tenant_physical_size, + tenant_resident_size, )); current_metrics.push(( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 937a6144b6..6d97f3206e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,7 +12,7 @@ use super::models::{ TimelineCreateRequest, TimelineInfo, }; use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::tenant::Timeline; +use crate::tenant::{with_ondemand_download, Timeline}; use crate::tenant_config::TenantConfOpt; use crate::{config::PageServerConf, tenant_mgr}; use utils::{ @@ -78,25 +78,23 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res } // Helper function to construct a TimelineInfo struct for a 
timeline -fn build_timeline_info( +async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, ) -> anyhow::Result { let mut info = build_timeline_info_common(timeline)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while // we're executing this function, we will outlive the timeline on-disk state. - info.current_logical_size_non_incremental = - Some(timeline.get_current_logical_size_non_incremental( - info.last_record_lsn, - CancellationToken::new(), - )?); - } - if include_non_incremental_physical_size { - info.current_physical_size_non_incremental = - Some(timeline.get_physical_size_non_incremental()?) + info.current_logical_size_non_incremental = Some( + timeline + .get_current_logical_size_non_incremental( + info.last_record_lsn, + CancellationToken::new(), + ) + .await?, + ); } Ok(info) } @@ -128,7 +126,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result) -> anyhow::Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); - let include_non_incremental_physical_size = - query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; let response_data = async { @@ -210,17 +206,16 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { - let timeline_info = build_timeline_info( - &timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") - .map_err(ApiError::InternalServerError)?; + let timeline_info = + build_timeline_info(&timeline, include_non_incremental_logical_size) + .await + .context( + "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", + ) + .map_err(ApiError::InternalServerError)?; response_data.push(timeline_info); } - Ok(response_data) } .instrument(info_span!("timeline_list", tenant = %tenant_id)) @@ -264,8 +259,6 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result(timeline_info) } @@ -308,10 +298,11 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result format!("{lsn}"), LsnForTimestamp::Future(_lsn) => "future".into(), LsnForTimestamp::Past(_lsn) => "past".into(), @@ -433,7 +424,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { - current_physical_size += timeline.get_physical_size(); + current_physical_size += timeline.layer_size_sum().approximate_is_ok(); } let state = tenant.current_state(); @@ -786,6 +777,45 @@ async fn timeline_checkpoint_handler(request: Request) -> Result, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + match 
timeline.spawn_download_all_remote_layers().await { + Ok(st) => json_response(StatusCode::ACCEPTED, st), + Err(st) => json_response(StatusCode::CONFLICT, st), + } +} + +async fn timeline_download_remote_layers_handler_get( + request: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + let info = timeline + .get_download_all_remote_layers_task_info() + .context("task never started since last pageserver process start") + .map_err(ApiError::NotFound)?; + json_response(StatusCode::OK, info) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -870,6 +900,14 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", testing_api!("run timeline checkpoint", timeline_checkpoint_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_post, + ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_get, + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index db83bdb3a1..1684ca3c64 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -187,13 +187,13 @@ fn import_slru( path: &Path, mut reader: Reader, len: usize, -) -> Result<()> { - trace!("importing slru file {}", path.display()); +) -> anyhow::Result<()> { + info!("importing slru file {path:?}"); let mut buf: [u8; 8192] = [0u8; 8192]; let filename = &path .file_name() - .expect("missing slru filename") + .with_context(|| format!("missing slru filename for path {path:?}"))? .to_string_lossy(); let segno = u32::from_str_radix(filename, 16)?; @@ -279,7 +279,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .no_ondemand_download()?; last_lsn = lsn; nrecords += 1; @@ -405,7 +407,9 @@ pub fn import_wal_from_tar( let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ - walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .no_ondemand_download()?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 626d5e99e3..e01eb12b7b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -91,7 +91,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { } } -fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 } else { diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 308f9cd4eb..205ee0ffad 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,13 +84,10 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -// Metrics for determining timeline's physical size. -// A layered timeline's physical is defined as the total size of -// (delta/image) layer files on disk. -static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { +static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "pageserver_current_physical_size", - "Current physical size grouped by timeline", + "pageserver_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem.", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric") @@ -146,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 1.0, // 1 sec ]; -const STORAGE_IO_TIME_OPERATIONS: &[&str] = - &["open", "close", "read", "write", "seek", "fsync", "gc"]; +const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ + "open", "close", "read", "write", "seek", "fsync", "gc", "metadata", +]; const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; @@ -375,7 +373,7 @@ pub struct TimelineMetrics { pub load_layer_map_histo: Histogram, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, - pub current_physical_size_gauge: UIntGauge, + pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -416,7 +414,7 @@ impl TimelineMetrics { let wait_lsn_time_histo = WAIT_LSN_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); let current_logical_size_gauge = CURRENT_LOGICAL_SIZE @@ -442,7 +440,7 @@ impl TimelineMetrics { load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, - current_physical_size_gauge, + resident_physical_size_gauge, current_logical_size_gauge, num_persistent_files_created, persistent_bytes_written, @@ -458,7 +456,7 @@ impl Drop for TimelineMetrics { let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); - let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, 
timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d9c19d04b7..fd4353a421 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -541,7 +541,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; + let exists = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_exists(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, @@ -558,7 +561,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; + let n_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_size(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -575,9 +581,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let total_blocks = - timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; - + let total_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + }) + .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { @@ -603,11 +610,14 @@ impl PageServerHandler { } */ - // FIXME: this profiling now happens at different place than it used to. The - // current profiling is based on a thread-local variable, so it doesn't work - // across awaits - let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; + let page = crate::tenant::with_ondemand_download(|| { + // FIXME: this profiling now happens at different place than it used to. The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); + timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7b4b05ed18..77910bceda 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,11 +6,12 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! 
+use super::tenant::PageReconstructResult; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; -use anyhow::{self, bail, ensure, Context}; +use crate::{repository::*, try_no_ondemand_download}; +use anyhow::Context; use bytes::{Buf, Bytes}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -97,16 +98,18 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, - ) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } - let nblocks = self.get_rel_size(tag, lsn, latest)?; + let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest)); if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, lsn, nblocks ); - return Ok(ZERO_PAGE.clone()); + return PageReconstructResult::Success(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); @@ -120,38 +123,45 @@ impl Timeline { dbnode: Oid, lsn: Lsn, latest: bool, - ) -> anyhow::Result { + ) -> PageReconstructResult { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn)?; + let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn)); for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest)?; + let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest)); total_blocks += n_blocks as usize; } - Ok(total_blocks) + PageReconstructResult::Success(total_blocks) } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_size( + &self, + tag: RelTag, + lsn: Lsn, + latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(nblocks); + return PageReconstructResult::Success(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest)? + && !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest)) { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, // without extending it. Tolerate that by claiming that // any non-existent FSM fork has size 0. - return Ok(0); + return PageReconstructResult::Success(0); } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn)?; + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); let nblocks = buf.get_u32_le(); if latest { @@ -164,25 +174,35 @@ impl Timeline { // associated with most recent value of LSN. self.update_cached_rel_size(tag, lsn, nblocks); } - Ok(nblocks) + PageReconstructResult::Success(nblocks) } /// Does relation exist? 
- pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> anyhow::Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_exists( + &self, + tag: RelTag, + lsn: Lsn, + _latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } // first try to lookup relation in cache if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(true); + return PageReconstructResult::Success(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); - - Ok(exists) + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Get a list of all existing relations in given tablespace and database. @@ -191,21 +211,25 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, - ) -> anyhow::Result> { + ) -> PageReconstructResult> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); - Ok(rels) + PageReconstructResult::Success(rels) + } + Err(e) => PageReconstructResult::from(e), + } } /// Look up given SLRU page version. 
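The converted accessors in this file rely on two helpers whose definitions are
outside this excerpt: the try_no_ondemand_download! macro and the
no_ondemand_download() method. As a rough mental model only, assuming
simplified types, they behave approximately like the toy versions below; the
real definitions live with PageReconstructResult and may differ.

// Toy model of the helpers used in these hunks; illustrative, not the real code.
enum PageReconstructResult<T> {
    Success(T),
    NeedsDownload(String), // placeholder for the remote layer that must be fetched
    Error(anyhow::Error),
}

impl<T> PageReconstructResult<T> {
    // Re-wrap the non-success variants for a caller whose success type differs.
    fn forward<U>(self) -> PageReconstructResult<U> {
        match self {
            PageReconstructResult::Success(_) => unreachable!("only called on non-success values"),
            PageReconstructResult::NeedsDownload(layer) => PageReconstructResult::NeedsDownload(layer),
            PageReconstructResult::Error(e) => PageReconstructResult::Error(e),
        }
    }

    // For code paths that must not trigger a download: unwrap a success, and
    // turn "needs download" into a plain error instead of waiting for it.
    fn no_ondemand_download(self) -> anyhow::Result<T> {
        match self {
            PageReconstructResult::Success(value) => Ok(value),
            PageReconstructResult::NeedsDownload(layer) => {
                anyhow::bail!("layer {layer} needs to be downloaded")
            }
            PageReconstructResult::Error(e) => Err(e),
        }
    }
}

// Propagate anything that is not Success to the caller, like `?` does for Result.
macro_rules! try_no_ondemand_download {
    ($e:expr) => {
        match $e {
            PageReconstructResult::Success(value) => value,
            other => return other.forward(),
        }
    };
}

// Example: building one PageReconstructResult-returning function from another.
fn double(input: PageReconstructResult<u32>) -> PageReconstructResult<u64> {
    let value = try_no_ondemand_download!(input);
    PageReconstructResult::Success(u64::from(value) * 2)
}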
@@ -215,7 +239,7 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn) } @@ -226,10 +250,10 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn)?; - Ok(buf.get_u32_le()) + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf.get_u32_le()) } /// Get size of an SLRU segment @@ -238,14 +262,18 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> anyhow::Result { + ) -> PageReconstructResult { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.segments.get(&segno).is_some(); - Ok(exists) + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.segments.get(&segno).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Locate LSN, such that all transactions that committed before @@ -258,7 +286,7 @@ impl Timeline { pub fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, - ) -> anyhow::Result { + ) -> PageReconstructResult { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; let max_lsn = self.get_last_record_lsn(); @@ -274,12 +302,12 @@ impl Timeline { // cannot overflow, high and low are both smaller than u64::MAX / 2 let mid = (high + low) / 2; - let cmp = self.is_latest_commit_timestamp_ge_than( + let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than( search_timestamp, Lsn(mid * 8), &mut found_smaller, &mut found_larger, - )?; + )); if cmp { high = mid; @@ -291,15 +319,15 @@ impl Timeline { (false, false) => { // This can happen if no commit records have been processed yet, e.g. // just after importing a cluster. - Ok(LsnForTimestamp::NoData(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn)) } (true, false) => { // Didn't find any commit timestamps larger than the request - Ok(LsnForTimestamp::Future(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn)) } (false, true) => { // Didn't find any commit timestamps smaller than the request - Ok(LsnForTimestamp::Past(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn)) } (true, true) => { // low is the LSN of the first commit record *after* the search_timestamp, @@ -309,7 +337,7 @@ impl Timeline { // Otherwise, if you restore to the returned LSN, the database will // include physical changes from later commits that will be marked // as aborted, and will need to be vacuumed away. - Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8))) } } } @@ -327,12 +355,20 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, - ) -> anyhow::Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? 
{ - let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; + ) -> PageReconstructResult { + for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) { + let nblocks = try_no_ondemand_download!(self.get_slru_segment_size( + SlruKind::Clog, + segno, + probe_lsn + )); for blknum in (0..nblocks).rev() { - let clog_page = - self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; + let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn( + SlruKind::Clog, + segno, + blknum, + probe_lsn + )); if clog_page.len() == BLCKSZ as usize + 8 { let mut timestamp_bytes = [0u8; 8]; @@ -341,61 +377,75 @@ impl Timeline { if timestamp >= search_timestamp { *found_larger = true; - return Ok(true); + return PageReconstructResult::Success(true); } else { *found_smaller = true; } } } } - Ok(false) + PageReconstructResult::Success(false) } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> anyhow::Result> { + pub fn list_slru_segments( + &self, + kind: SlruKind, + lsn: Lsn, + ) -> PageReconstructResult> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; - - Ok(dir.segments) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.segments), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> anyhow::Result { + pub fn get_relmap_file( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + ) -> PageReconstructResult { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> anyhow::Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn)?; - let dir = DbDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn)); - Ok(dir.dbdirs) + match DbDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.dbdirs), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result { + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult { let key = twophase_file_key(xid); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> anyhow::Result> { + pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let dir = TwoPhaseDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn)); - Ok(dir.xids) + match TwoPhaseDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.xids), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_control_file(&self, lsn: Lsn) -> anyhow::Result { + pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult { self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> anyhow::Result { + pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult 
{ self.get(CHECKPOINT_KEY, lsn) } @@ -404,23 +454,26 @@ impl Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. - pub fn get_current_logical_size_non_incremental( + pub async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, cancel: CancellationToken, - ) -> std::result::Result { + ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn)? { + for rel in + crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn)) + .await? + { if cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -433,7 +486,7 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { + pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -441,8 +494,8 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; - let dbdir = DbDirectory::des(&buf)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; + let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); dbs.sort_unstable(); @@ -451,14 +504,15 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn)? + .list_rels(spcnode, dbnode, lsn) + .no_ondemand_download()? 
.iter() .cloned() .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -474,13 +528,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = self.get_download(slrudir_key, lsn).await?; + let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn)?; + let mut buf = self.get_download(segsize_key, lsn).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -492,8 +546,8 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; + let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?; + let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); for xid in xids { @@ -606,7 +660,7 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -633,7 +687,7 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -652,7 +706,7 @@ impl<'a> DatadirModification<'a> { /// Store a relmapper file (pg_filenode.map) in the repository pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); @@ -680,10 +734,10 @@ impl<'a> DatadirModification<'a> { pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { - bail!("twophase file for xid {} already exists", xid); + anyhow::bail!("twophase file for xid {} already exists", xid); } self.put( TWOPHASEDIR_KEY, @@ -707,10 +761,13 @@ impl<'a> DatadirModification<'a> { pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; + let total_blocks = self + .tline + .get_db_size(spcnode, dbnode, req_lsn, true) + .no_ondemand_download()?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, 
dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -734,10 +791,10 @@ impl<'a> DatadirModification<'a> { /// /// 'nblocks' is the initial size. pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -749,12 +806,12 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key)?)? + RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)? }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - bail!("rel {} already exists", rel); + anyhow::bail!("rel {rel} already exists"); } self.put( rel_dir_key, @@ -778,12 +835,16 @@ impl<'a> DatadirModification<'a> { /// Truncate relation pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true)? { + if self + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -804,11 +865,11 @@ impl<'a> DatadirModification<'a> { /// Extend relation /// If new size is smaller, do nothing. pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -825,11 +886,11 @@ impl<'a> DatadirModification<'a> { /// Drop a relation. 
pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -840,7 +901,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -860,11 +921,11 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - bail!("slru segment {:?}/{} already exists", kind, segno); + anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } self.put( dir_key, @@ -899,7 +960,7 @@ impl<'a> DatadirModification<'a> { pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -925,7 +986,7 @@ impl<'a> DatadirModification<'a> { /// This method is used for marking truncated SLRU files pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -1019,7 +1080,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - fn get(&self, key: Key) -> anyhow::Result { + fn get(&self, key: Key) -> PageReconstructResult { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1027,14 +1088,16 @@ impl<'a> DatadirModification<'a> { // value that has been removed, deletion only avoids leaking storage. if let Some(value) = self.pending_updates.get(&key) { if let Value::Image(img) = value { - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { // Currently, we never need to read back a WAL record that we // inserted in the same "transaction". All the metadata updates // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. 
- bail!("unexpected pending WAL record"); + return PageReconstructResult::from(anyhow::anyhow!( + "unexpected pending WAL record" + )); } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); @@ -1400,7 +1463,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { }, key.field6, ), - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } @@ -1426,14 +1489,14 @@ pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber 0x00 => SlruKind::Clog, 0x01 => SlruKind::MultiXactMembers, 0x02 => SlruKind::MultiXactOffsets, - _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), }; let segno = key.field4; let blknum = key.field6; (kind, segno, blknum) } - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 9253b250cd..a2337e8fd6 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -148,31 +148,43 @@ //! following two cases: //! - (1) We had the file locally, deleted it locally, scheduled a remote delete, //! but crashed before it finished remotely. -//! - (2) We never had the file locally because we were still in tenant attach -//! when we crashed. (Similar case for on-demand download in the future.) +//! - (2) We never had the file locally because we haven't on-demand downloaded +//! it yet. //! -//! # Downloads (= Tenant Attach) +//! # Downloads //! //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for -//! downloading files from the remote storage. Downloads are performed immediately, -//! independently of the uploads. +//! downloading files from the remote storage. Downloads are performed immediately +//! against the `RemoteStorage`, independently of the upload queue. //! //! When we attach a tenant, we perform the following steps: //! - create `Tenant` object in `TenantState::Attaching` state -//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s -//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart` -//! - eagerly download all the remote layers using the client's download APIs -//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state. +//! - List timelines that are present in remote storage, and for each: +//! - download their remote [`IndexPart`]s +//! - create `Timeline` struct and a `RemoteTimelineClient` +//! - initialize the client's upload queue with its `IndexPart` +//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart` +//! but not present locally +//! - schedule uploads for layers that are only present locally. +//! - if the remote `IndexPart`'s metadata was newer than the metadata in +//! the local filesystem, write the remote metadata to the local filesystem +//! - After the above is done for each timeline, open the tenant for business by +//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. +//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! -//! Most of the above happens in [`Timeline::reconcile_with_remote`]. +//! 
Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers. //! We keep track of the fact that a client is in `Attaching` state in a marker -//! file on the local disk. -//! However, the distinction is moot for storage sync since we call -//! `reconcile_with_remote` for tenants both with and without the marker file. -//! -//! In the future, downloading will be done on-demand and `reconcile_with_remote` -//! will only be responsible for re-scheduling upload ops after a crash of an -//! `Active` tenant. +//! file on the local disk. This is critical because, when we restart the pageserver, +//! we do not want to do the `List timelines` step for each tenant that has already +//! been successfully attached (for performance & cost reasons). +//! Instead, for a tenant without the attach marker file, we assume that the +//! local state is in sync or ahead of the remote state. This includes the list +//! of all of the tenant's timelines, which is particularly critical to be up-to-date: +//! if there's a timeline on the remote that the pageserver doesn't know about, +//! the GC will not consider its branch point, leading to data loss. +//! So, for a tenant with the attach marker file, we know that we do not yet have +//! persisted all the remote timeline's metadata files locally. To exclude the +//! risk above, we re-run the procedure for such tenants //! //! # Operating Without Remote Storage //! diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs index c81be05981..4256767020 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/storage_sync2/download.rs @@ -180,6 +180,10 @@ pub async fn list_remote_timelines<'a>( let tenant_path = conf.timelines_path(&tenant_id); let tenant_storage_path = conf.remote_path(&tenant_path)?; + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + let timelines = download_retry( || storage.list_prefixes(Some(&tenant_storage_path)), &format!("list prefixes for {tenant_path:?}"), diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index fe3ad1a57d..a1b3ad26b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -35,6 +35,7 @@ #![allow(clippy::declare_interior_mutable_const)] use std::collections::HashMap; +use std::fmt; use std::future::Future; use std::panic::AssertUnwindSafe; use std::sync::atomic::{AtomicU64, Ordering}; @@ -134,8 +135,15 @@ pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create background op runtime") }); +#[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); +impl fmt::Display for PageserverTaskId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Each task that we track is associated with a "task ID". It's just an /// increasing number that we assign. Note that it is different from tokio::task::Id. 
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1); @@ -198,6 +206,9 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, + // Task that downloads a file from remote storage + RemoteDownloadTask, + // task that handles the initial downloading of all tenants InitialLoad, @@ -206,6 +217,9 @@ pub enum TaskKind { // task that handhes metrics collection MetricsCollection, + + // task that drives downloading layers + DownloadAllRemoteLayers, } #[derive(Default)] @@ -437,6 +451,10 @@ pub fn current_task_kind() -> Option { CURRENT_TASK.try_with(|ct| ct.kind).ok() } +pub fn current_task_id() -> Option { + CURRENT_TASK.try_with(|ct| ct.task_id).ok() +} + /// A Future that can be used to check if the current task has been requested to /// shut down. pub async fn shutdown_watcher() { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 799a34fb3b..1240a3b4fb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -81,6 +81,7 @@ pub mod filename; mod image_layer; mod inmemory_layer; pub mod layer_map; +mod remote_layer; pub mod metadata; mod par_fsync; @@ -90,7 +91,7 @@ mod timeline; pub mod size; -pub use timeline::Timeline; +pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline}; // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -2780,9 +2781,18 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); Ok(()) } @@ -2859,15 +2869,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?, "foobar at 0x20" ); @@ -3026,7 +3036,10 @@ mod tests { tenant .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); + assert!(newtline + .get(*TEST_KEY, Lsn(0x25)) + .no_ondemand_download() + .is_ok()); Ok(()) } @@ -3056,7 +3069,7 @@ mod tests { // Check that the data is still accessible on the branch. 
assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50))?, + newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3203,11 +3216,26 @@ mod tests { tline.freeze_and_flush().await?; tline.compact().await?; - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?, + TEST_IMG("foo at 0x30") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?, + TEST_IMG("foo at 0x40") + ); Ok(()) } @@ -3315,7 +3343,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3401,7 +3429,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3476,7 +3504,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn)?, + tline.get(test_key, *lsn).no_ondemand_download()?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index e1006dfe00..5b724b6263 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -39,7 +39,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::{BufWriter, Write}; use std::io::{Seek, SeekFrom}; use std::ops::Range; @@ -183,6 +183,8 @@ pub struct DeltaLayer { pub key_range: Range, pub lsn_range: Range, + pub file_size: u64, + inner: RwLock, } @@ -411,6 +413,10 @@ impl PersistentLayer for DeltaLayer { fs::remove_file(self.path())?; Ok(()) } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl DeltaLayer { @@ -535,6 +541,7 @@ impl DeltaLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &DeltaFileName, + file_size: u64, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), @@ -542,6 +549,7 @@ impl DeltaLayer { tenant_id, key_range: filename.key_range.clone(), lsn_range: filename.lsn_range.clone(), + file_size, inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -554,21 +562,23 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
- pub fn new_for_path(path: &Path, file: F) -> Result - where - F: FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn_range: summary.lsn_range, + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -725,6 +735,10 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + // Note: Because we opened the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -734,6 +748,7 @@ impl DeltaLayerWriterInner { timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index b1dbbfb683..1e129fc01d 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -36,10 +36,11 @@ use bytes::Bytes; use hex; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::Write; use std::io::{Seek, SeekFrom}; use std::ops::Range; +use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard}; use tracing::*; @@ -105,6 +106,7 @@ pub struct ImageLayer { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub key_range: Range, + pub file_size: u64, // This entry contains an image of all pages as of this LSN pub lsn: Lsn, @@ -228,6 +230,10 @@ impl PersistentLayer for ImageLayer { fs::remove_file(self.path())?; Ok(()) } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl ImageLayer { @@ -344,6 +350,7 @@ impl ImageLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &ImageFileName, + file_size: u64, ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), @@ -351,6 +358,7 @@ impl ImageLayer { tenant_id, key_range: filename.key_range.clone(), lsn: filename.lsn, + file_size, inner: RwLock::new(ImageLayerInner { loaded: false, file: None, @@ -363,21 +371,21 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
- pub fn new_for_path(path: &Path, file: F) -> Result - where - F: std::os::unix::prelude::FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; - + let metadata = file + .metadata() + .context("get file metadata to determine size")?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn: summary.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { file: None, loaded: false, @@ -523,6 +531,10 @@ impl ImageLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get metadata to determine file size")?; + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -532,6 +544,7 @@ impl ImageLayerWriterInner { tenant_id: self.tenant_id, key_range: self.key_range.clone(), lsn: self.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/remote_layer.rs b/pageserver/src/tenant/remote_layer.rs new file mode 100644 index 0000000000..affe8ca0a8 --- /dev/null +++ b/pageserver/src/tenant/remote_layer.rs @@ -0,0 +1,212 @@ +//! A RemoteLayer is an in-memory placeholder for a layer file that exists +//! in remote storage. +//! +use crate::config::PageServerConf; +use crate::repository::Key; +use crate::storage_sync::index::LayerFileMetadata; +use crate::tenant::delta_layer::DeltaLayer; +use crate::tenant::filename::{DeltaFileName, ImageFileName}; +use crate::tenant::image_layer::ImageLayer; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use anyhow::{bail, Result}; +use std::ops::Range; +use std::path::PathBuf; +use std::sync::Arc; + +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use super::filename::LayerFileName; +use super::storage_layer::{LayerIter, LayerKeyIter, PersistentLayer}; + +#[derive(Debug)] +pub struct RemoteLayer { + tenantid: TenantId, + timelineid: TimelineId, + key_range: Range, + lsn_range: Range, + + pub file_name: LayerFileName, + + pub layer_metadata: LayerFileMetadata, + + is_delta: bool, + + is_incremental: bool, + + pub(crate) ongoing_download: Arc, +} + +impl Layer for RemoteLayer { + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_state: &mut ValueReconstructState, + ) -> Result { + bail!( + "layer {} needs to be downloaded", + self.filename().file_name() + ); + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + /// debugging function to print out the contents of the layer + fn dump(&self, _verbose: bool) -> Result<()> { + println!( + "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenantid, + self.timelineid, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + Ok(()) + } + + fn short_id(&self) -> String { + self.filename().file_name() + } +} + +impl PersistentLayer for RemoteLayer { + fn get_tenant_id(&self) -> TenantId { + self.tenantid + } + + fn 
get_timeline_id(&self) -> TimelineId { + self.timelineid + } + + fn filename(&self) -> LayerFileName { + if self.is_delta { + DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + } + .into() + } else { + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + } + .into() + } + } + + fn local_path(&self) -> Option { + None + } + + fn iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn key_iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn delete(&self) -> Result<()> { + Ok(()) + } + + fn downcast_remote_layer<'a>(self: Arc) -> Option> { + Some(self) + } + + fn is_remote_layer(&self) -> bool { + true + } + + fn file_size(&self) -> Option { + self.layer_metadata.file_size() + } +} + +impl RemoteLayer { + pub fn new_img( + tenantid: TenantId, + timelineid: TimelineId, + fname: &ImageFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn..(fname.lsn + 1), + is_delta: false, + is_incremental: false, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + pub fn new_delta( + tenantid: TenantId, + timelineid: TimelineId, + fname: &DeltaFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn_range.clone(), + is_delta: true, + is_incremental: true, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + /// Create a Layer struct representing this layer, after it has been downloaded. 
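// [Editorial illustration, not part of the patch] A condensed sketch of the
// RemoteLayer lifecycle that this new file enables: a read path finds a layer
// in the layer map, detects that it is only a placeholder, and asks the
// timeline to download it; Timeline::download_remote_layer (added later in
// this patch) then uses create_downloaded_layer() below to swap the
// placeholder for a real Delta-/ImageLayer. `ensure_resident` is a
// hypothetical helper name, and the types are assumed from the surrounding
// crate.
async fn ensure_resident(
    timeline: std::sync::Arc<Timeline>,
    layer: std::sync::Arc<dyn PersistentLayer>,
) -> anyhow::Result<()> {
    // downcast_remote_layer() returns Some only for placeholder layers.
    if let Some(remote) = layer.downcast_remote_layer() {
        // Downloads the file and replaces the RemoteLayer in the layer map
        // with the layer built by RemoteLayer::create_downloaded_layer().
        timeline.download_remote_layer(remote).await?;
    }
    Ok(())
}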
+ pub fn create_downloaded_layer( + &self, + conf: &'static PageServerConf, + file_size: u64, + ) -> Arc { + if self.is_delta { + let fname = DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + }; + Arc::new(DeltaLayer::new( + conf, + self.timelineid, + self.tenantid, + &fname, + file_size, + )) + } else { + let fname = ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + }; + Arc::new(ImageLayer::new( + conf, + self.timelineid, + self.tenantid, + &fname, + file_size, + )) + } + } +} diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 5ce0837562..aa11985cbe 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -97,8 +97,6 @@ pub(super) async fn gather_inputs( // used to determine the `retention_period` for the size model let mut max_cutoff_distance = None; - // this will probably conflict with on-demand downloaded layers, or at least force them all - // to be downloaded for timeline in timelines { let last_record_lsn = timeline.get_last_record_lsn(); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 79eaa96591..8bfac5df8e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,6 +8,7 @@ use anyhow::Result; use bytes::Bytes; use std::ops::Range; use std::path::PathBuf; +use std::sync::Arc; use utils::{ id::{TenantId, TimelineId}, @@ -15,6 +16,8 @@ use utils::{ }; use super::filename::LayerFileName; +use super::remote_layer::RemoteLayer; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -161,4 +164,28 @@ pub trait PersistentLayer: Layer { /// Permanently remove this layer from disk. fn delete(&self) -> Result<()>; + + fn downcast_remote_layer(self: Arc) -> Option> { + None + } + + fn is_remote_layer(&self) -> bool { + false + } + + /// Returns None if the layer file size is not known. + /// + /// Should not change over the lifetime of the layer object because + /// current_physical_size is computed as the som of this value. 
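// [Editorial illustration, not part of the patch] The comment above describes
// the contract of file_size(), declared just below: callers sum the returned
// Option<u64> values and must remember whether any size was unknown. A
// commented sketch of such an aggregation (the real one is
// Timeline::layer_size_sum() later in this patch; the helper name here is
// hypothetical):
//
//     fn sum_layer_sizes<'a>(
//         layers: impl Iterator<Item = &'a Arc<dyn PersistentLayer>>,
//     ) -> (u64, bool) {
//         let mut sum = 0u64;
//         let mut all_known = true;
//         for layer in layers {
//             match layer.file_size() {
//                 Some(size) => sum += size,
//                 // Unknown sizes contribute 0, so `sum` is only a lower bound.
//                 None => all_known = false,
//             }
//         }
//         (sum, all_known)
//     }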
+ fn file_size(&self) -> Option; +} + +pub fn downcast_remote_layer( + layer: &Arc, +) -> Option> { + if layer.is_remote_layer() { + Arc::clone(layer).downcast_remote_layer() + } else { + None + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 61d619a17b..f4288fea36 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3,11 +3,14 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use itertools::Itertools; use once_cell::sync::OnceCell; -use pageserver_api::models::TimelineState; +use pageserver_api::models::{ + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskState, TimelineState, +}; use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; -use tokio::task::spawn_blocking; use tokio_util::sync::CancellationToken; use tracing::*; @@ -22,6 +25,7 @@ use std::time::{Duration, Instant, SystemTime}; use crate::storage_sync::index::IndexPart; use crate::storage_sync::RemoteTimelineClient; +use crate::tenant::remote_layer::RemoteLayer; use crate::tenant::{ delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, @@ -76,7 +80,7 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, - _myself: Weak, + myself: Weak, pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -93,10 +97,7 @@ pub struct Timeline { walredo_mgr: Arc, /// Remote storage client. - /// - /// If Some, use it to upload all newly created layers to the remote storage, - /// and keep remote metadata file in sync. In the future, also use it to download - /// layer files on-demand. + /// See [`storage_sync2`] module comment for details. pub remote_client: Option>, // What page versions do we hold in the repository? If we get a @@ -187,6 +188,8 @@ pub struct Timeline { /// Relation size cache pub rel_size_cache: RwLock>, + download_all_remote_layers_task_info: RwLock>, + state: watch::Sender, } @@ -308,12 +311,68 @@ impl LogicalSize { } } +/// Returned by [`Timeline::layer_size_sum`] +pub enum LayerSizeSum { + /// The result is accurate. + Accurate(u64), + // We don't know the layer file size of one or more layers. + // They contribute to the sum with a value of 0. + // Hence, the sum is a lower bound for the actualy layer file size sum. + ApproximateLowerBound(u64), +} + +impl LayerSizeSum { + pub fn approximate_is_ok(self) -> u64 { + match self { + LayerSizeSum::Accurate(v) => v, + LayerSizeSum::ApproximateLowerBound(v) => v, + } + } +} + pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, pub last_received_msg_ts: u128, } +/// Like `?`, but for [`PageReconstructResult`]. +/// Use it to bubble up the `NeedsDownload` and `Error` to the caller. +/// +/// Once `std::ops::Try` is stabilized, we should use it instead of this macro. +#[macro_export] +macro_rules! try_no_ondemand_download { + ($result:expr) => {{ + let result = $result; + match result { + PageReconstructResult::Success(value) => value, + PageReconstructResult::NeedsDownload(timeline, layer) => { + return PageReconstructResult::NeedsDownload(timeline, layer); + } + PageReconstructResult::Error(e) => return PageReconstructResult::Error(e), + } + }}; +} + +/// Replacement for `?` in functions that return [`PageReconstructResult`]. +/// +/// Given an `expr: Result`, use `try_page_reconstruct_result!(expr)` +/// instead of `(expr)?`. +/// If `expr` is `Ok(v)`, the macro evaluates to `v`. 
+/// If `expr` is `Err(e)`, the macro returns `PageReconstructResult::Error(e.into())`.
+///
+/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
+#[macro_export]
+macro_rules! try_page_reconstruct_result {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            Ok(v) => v,
+            Err(e) => return PageReconstructResult::from(e),
+        }
+    }};
+}
+
 ///
 /// Information about how much history needs to be retained, needed by
 /// Garbage Collection.
@@ -343,6 +402,77 @@ pub struct GcInfo {
     pub pitr_cutoff: Lsn,
 }
 
+pub enum PageReconstructResult<T> {
+    Success(T),
+    /// The given RemoteLayer needs to be downloaded and replaced in the timeline's layer map
+    /// for the operation to succeed. Use [`Timeline::download_remote_layer`] to do it, then
+    /// retry the operation that returned this error.
+    NeedsDownload(Weak<Timeline>, Weak<RemoteLayer>),
+    Error(PageReconstructError),
+}
+
+/// An error happened in a get() operation.
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
+    #[error(transparent)]
+    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
+
+    #[error(transparent)]
+    WalRedo(#[from] crate::walredo::WalRedoError),
+}
+
+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
+/// This impl makes it so you can substitute the return type
+/// `Result<T, PageReconstructError>` with `PageReconstructResult<T>` in functions:
+/// any error type that converts into `PageReconstructError` (in particular
+/// `anyhow::Error`) also converts into `PageReconstructResult<T>`, so error
+/// values that used to be propagated with `?` can be returned with
+/// `PageReconstructResult::from(e)` or the `try_page_reconstruct_result!` macro.
+impl<T, E> From<E> for PageReconstructResult<T>
+where
+    E: Into<PageReconstructError>,
+{
+    fn from(e: E) -> Self {
+        Self::Error(e.into())
+    }
+}
+
+impl<T> PageReconstructResult<T> {
+    /// Treat the need for on-demand download as an error.
+    ///
+    /// **Avoid this function in new code** if you can help it,
+    /// as on-demand download will become the norm in the future,
+    /// especially once we implement layer file eviction.
+    ///
+    /// If you are in an async function, use [`with_ondemand_download`]
+    /// to do the download right here.
+    ///
+    /// If you are in a sync function, change its return type from
+    /// `Result<T>` to `PageReconstructResult<T>` and bubble up
+    /// the non-success cases of `PageReconstructResult` to the caller.
+    /// This gives them a chance to do the download and retry.
+    /// Consider using [`try_no_ondemand_download`] for convenience.
+    ///
+    /// For more background, read the comment on [`with_ondemand_download`].
+    pub fn no_ondemand_download(self) -> anyhow::Result<T> {
+        match self {
+            PageReconstructResult::Success(value) => Ok(value),
+            // TODO print more info about the timeline
+            PageReconstructResult::NeedsDownload(_, _) => anyhow::bail!("Layer needs downloading"),
+            PageReconstructResult::Error(e) => {
+                Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
+            }
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -370,8 +500,10 @@ impl Timeline {
     /// the Repository implementation may incorrectly return a value from an ancestor
     /// branch, for example, or waste a lot of cycles chasing the non-existing key.
     ///
-    pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
-        anyhow::ensure!(lsn.is_valid(), "Invalid LSN");
+    pub fn get(&self, key: Key, lsn: Lsn) -> PageReconstructResult<Bytes> {
+        if !lsn.is_valid() {
+            return PageReconstructResult::from(anyhow!("Invalid LSN"));
+        }
 
         // Check the page cache.
We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image @@ -381,7 +513,7 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => return PageReconstructResult::Success(cached_img), // exact LSN match, return the image Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -396,13 +528,18 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + try_no_ondemand_download!(self.get_reconstruct_data(key, lsn, &mut reconstruct_state)); self.metrics .reconstruct_time_histo .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } + // Like get(), but if a remote layer file is needed, it is downloaded as part of this call. + pub async fn get_download(&self, key: Key, lsn: Lsn) -> anyhow::Result { + with_ondemand_download(|| self.get(key, lsn)).await + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -429,30 +566,27 @@ impl Timeline { } } - /// Get the physical size of the timeline at the latest LSN - pub fn get_physical_size(&self) -> u64 { - self.metrics.current_physical_size_gauge.get() + /// The sum of the file size of all historic layers in the layer map. + /// This method makes no distinction between local and remote layers. + /// Hence, the result **does not represent local filesystem usage**. + pub fn layer_size_sum(&self) -> LayerSizeSum { + let layer_map = self.layers.read().unwrap(); + let mut size = 0; + let mut no_size_cnt = 0; + for l in layer_map.iter_historic_layers() { + let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); + size += l_size; + no_size_cnt += l_no_size; + } + if no_size_cnt == 0 { + LayerSizeSum::Accurate(size) + } else { + LayerSizeSum::ApproximateLowerBound(size) + } } - /// Get the physical size of the timeline at the latest LSN non incrementally - pub fn get_physical_size_non_incremental(&self) -> anyhow::Result { - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - // total size of layer files in the current timeline directory - let mut total_physical_size = 0; - - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_string_lossy(); - - if ImageFileName::parse_str(&fname).is_some() - || DeltaFileName::parse_str(&fname).is_some() - { - total_physical_size += direntry.metadata()?.len(); - } - } - - Ok(total_physical_size) + pub fn get_resident_physical_size(&self) -> u64 { + self.metrics.resident_physical_size_gauge.get() } /// @@ -560,14 +694,18 @@ impl Timeline { // Define partitioning schema if needed - match self.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - ) { + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) + .await + { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". 
- let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + let layer_paths_to_upload = + self.create_image_layers(&partitioning, lsn, false).await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -761,7 +899,7 @@ impl Timeline { let mut result = Timeline { conf, tenant_conf, - _myself: myself.clone(), + myself: myself.clone(), timeline_id, tenant_id, pg_version, @@ -817,6 +955,9 @@ impl Timeline { last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(HashMap::new()), + + download_all_remote_layers_task_info: RwLock::new(None), + state, }; result.repartition_threshold = result.get_checkpoint_distance() / 10; @@ -935,11 +1076,18 @@ impl Timeline { continue; } - let layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); + let file_size = direntry_path.metadata()?.len(); + + let layer = ImageLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &imgfilename, + file_size, + ); trace!("found layer {}", layer.path().display()); - total_physical_size += layer.path().metadata()?.len(); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { @@ -959,11 +1107,18 @@ impl Timeline { continue; } - let layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); + let file_size = direntry_path.metadata()?.len(); + + let layer = DeltaLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &deltafilename, + file_size, + ); trace!("found layer {}", layer.path().display()); - total_physical_size += layer.path().metadata()?.len(); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { @@ -997,7 +1152,7 @@ impl Timeline { num_layers, disk_consistent_lsn, total_physical_size ); self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .set(total_physical_size); timer.stop_and_record(); @@ -1005,21 +1160,14 @@ impl Timeline { Ok(()) } - async fn download_missing( + async fn create_remote_layers( &self, index_part: &IndexPart, - remote_client: &RemoteTimelineClient, local_layers: HashMap>, up_to_date_disk_consistent_lsn: Lsn, ) -> anyhow::Result>> { // Are we missing some files that are present in remote storage? - // Download them now. - // TODO Downloading many files this way is not efficient. - // Better to use FuturesUnordered. Maybe keep as is because: - // a) inplace download is a throw-away code, on-demand patch doesnt need that - // b) typical case now is that there is nothing to sync, this downloads a lot - // 1) if there was another pageserver that came and generated new files - // 2) during attach of a timeline with big history which we currently do not do + // Create RemoteLayer instances for them. let mut local_only_layers = local_layers; for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1033,7 +1181,7 @@ impl Timeline { // Is the local layer's size different from the size stored in the // remote index file? // If so, rename_to_backup those files & replace their local layer with - // a RemoteLayer in the laye rmap so that we re-download them on-demand. + // a RemoteLayer in the layer map so that we re-download them on-demand. 
if let Some(local_layer) = local_layer { let local_layer_path = local_layer .local_path() @@ -1058,7 +1206,7 @@ impl Timeline { assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { - self.metrics.current_physical_size_gauge.sub(local_size); + self.metrics.resident_physical_size_gauge.sub(local_size); self.layers.write().unwrap().remove_historic(local_layer); // fall-through to adding the remote layer } @@ -1079,7 +1227,7 @@ impl Timeline { } info!( - "remote layer does not exist locally, downloading it now: {}", + "remote layer does not exist locally, creating remote layer: {}", remote_layer_name.file_name() ); @@ -1093,28 +1241,18 @@ impl Timeline { continue; } - trace!("downloading image file: {remote_layer_name:?}"); - let downloaded_size = remote_client - .download_layer_file(remote_layer_name, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download image layer {remote_layer_name:?}") - })?; - trace!("done"); + let remote_layer = RemoteLayer::new_img( + self.tenant_id, + self.timeline_id, + imgfilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); - let image_layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, imgfilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(image_layer)); - self.metrics - .current_physical_size_gauge - .add(downloaded_size); + self.layers.write().unwrap().insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { - // Create a DeltaLayer struct for each delta file. + // Create a RemoteLayer for the delta file. // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is // OK for a delta layer to have end LSN 101, but if the end LSN @@ -1122,29 +1260,19 @@ impl Timeline { // before crash. if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { warn!( - "found future delta layer {} on timeline {} remote_consistent_lsn is {}", - deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn - ); + "found future delta layer {} on timeline {} remote_consistent_lsn is {}", + deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn + ); continue; } - - trace!("downloading delta file: {remote_layer_name:?}"); - let sz = remote_client - .download_layer_file(remote_layer_name, &remote_layer_metadata) - .await - .with_context(|| { - format!("failed to download delta layer {remote_layer_name:?}") - })?; - trace!("done"); - - let delta_layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, deltafilename); - - self.layers - .write() - .unwrap() - .insert_historic(Arc::new(delta_layer)); - self.metrics.current_physical_size_gauge.add(sz); + let remote_layer = RemoteLayer::new_delta( + self.tenant_id, + self.timeline_id, + deltafilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); + self.layers.write().unwrap().insert_historic(remote_layer); } #[cfg(test)] LayerFileName::Test(_) => unreachable!(), @@ -1154,22 +1282,22 @@ impl Timeline { Ok(local_only_layers) } + /// This function will synchronize local state with what we have in remote storage. /// - /// This function will synchronize local data with what we have in remote storage. - /// 1. It will download missing layer files. - /// 2. It will update local metadata if remote one has greater `disk_consistent_lsn`. 
- /// 3. It will upload files that are missing on the remote - /// 4. It will update index file on the remote accordingly - /// TODO may be a bit cleaner to do things based on populated remote client, - /// and then do things based on its upload_queue.latest_files + /// Steps taken: + /// 1. Initialize upload queue based on `index_part`. + /// 2. Create `RemoteLayer` instances for layers that exist only on the remote. + /// The list of layers on the remote comes from `index_part`. + /// The list of local layers is given by the layer map's `iter_historic_layers()`. + /// So, the layer map must have been loaded already. + /// 3. Schedule upload of local-only layer files (which will then also update the remote + /// IndexPart to include the new layer files). /// - /// This is used during tenant attach. The layer map must have been loaded - /// with local filesystem contents already. - /// - /// The caller should provide IndexPart if it exists on the remote storage. If it's None, - /// we assume that it is missing on the remote storage, which means that we initialized - /// a timeline and then restarted before successful upload was performed + /// Refer to the `storage_sync2` module comment for more context. /// + /// # TODO + /// May be a bit cleaner to do things based on populated remote client, + /// and then do things based on its upload_queue.latest_files. #[instrument(skip(self, index_part, up_to_date_metadata))] pub async fn reconcile_with_remote( &self, @@ -1199,9 +1327,10 @@ impl Timeline { index_part.timeline_layers.len() ); remote_client.init_upload_queue(index_part)?; - - self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn) - .await? + let local_only_filenames = self + .create_remote_layers(index_part, local_layers, disk_consistent_lsn) + .await?; + local_only_filenames } None => { info!("initializing upload queue as empty"); @@ -1323,9 +1452,15 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); - spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn, cancel)) - .await - .context("Failed to spawn calculation result task")? + tokio::task::spawn_blocking(move || { + // Run in a separate thread since this can do a lot of + // synchronous file IO without .await inbetween + // if there are no RemoteLayers that would require downloading. + let h = tokio::runtime::Handle::current(); + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + }) + .await + .context("Failed to spawn calculation result task")? }; let timeline_state_cancellation = async { loop { @@ -1376,7 +1511,7 @@ impl Timeline { /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. 
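// [Editorial illustration, not part of the patch] The initial logical-size
// calculation a few lines above bridges sync-heavy work and the occasional
// await (e.g. an on-demand download) by running on a blocking thread and
// re-entering the runtime only when needed. A self-contained sketch of that
// pattern, assuming a tokio multi-thread runtime; `sync_heavy_step` is a
// hypothetical stand-in for one mostly-synchronous unit of work.
async fn run_sync_heavy_work() -> anyhow::Result<u64> {
    use anyhow::Context;

    // Hypothetical unit of work: mostly synchronous, but occasionally needs
    // to await (think: trigger an on-demand layer download).
    async fn sync_heavy_step(step: u64) -> anyhow::Result<u64> {
        Ok(step)
    }

    tokio::task::spawn_blocking(|| -> anyhow::Result<u64> {
        // We are on a blocking thread: synchronous file IO here does not
        // stall the async executor.
        let handle = tokio::runtime::Handle::current();
        let mut total = 0;
        for step in 0..10u64 {
            // Re-enter the async world only for the rare await point.
            total += handle.block_on(sync_heavy_step(step))?;
        }
        Ok(total)
    })
    .await
    .context("spawn_blocking task panicked or was cancelled")?
}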
- pub fn calculate_logical_size( + async fn calculate_logical_size( &self, up_to_lsn: Lsn, cancel: CancellationToken, @@ -1421,7 +1556,9 @@ impl Timeline { } else { self.metrics.logical_size_histo.start_timer() }; - let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn, cancel)?; + let logical_size = self + .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); Ok(logical_size) @@ -1458,7 +1595,7 @@ impl TraversalLayerExt for Arc { match self.local_path() { Some(local_path) => { debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())), - "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary", + "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary", ); format!("{}", local_path.display()) } @@ -1497,7 +1634,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, - ) -> Result<(), PageReconstructError> { + ) -> PageReconstructResult<()> { // Start from the current timeline. let mut timeline_owned; let mut timeline = self; @@ -1524,12 +1661,12 @@ impl Timeline { // The function should have updated 'state' //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); match result { - ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Complete => return PageReconstructResult::Success(()), ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { self.metrics.materialized_page_cache_hit_counter.inc_by(1); - return Ok(()); + return PageReconstructResult::Success(()); } if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid @@ -1562,7 +1699,10 @@ impl Timeline { timeline.ancestor_lsn, cont_lsn ); - let ancestor = timeline.get_ancestor_timeline()?; + let ancestor = match timeline.get_ancestor_timeline() { + Ok(timeline) => timeline, + Err(e) => return PageReconstructResult::from(e), + }; timeline_owned = ancestor; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); @@ -1580,11 +1720,14 @@ impl Timeline { // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, start_lsn); - result = open_layer.get_value_reconstruct_data( + result = match open_layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue; @@ -1595,11 +1738,14 @@ impl Timeline { if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); - result = frozen_layer.get_value_reconstruct_data( + result = match frozen_layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; @@ -1609,12 +1755,24 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + // If it's a remote layer, the caller can do the download and retry. + if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) { + info!("need remote layer {}", layer.traversal_id()); + return PageReconstructResult::NeedsDownload( + Weak::clone(&timeline.myself), + Arc::downgrade(&remote_layer), + ); + } + let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = layer.get_value_reconstruct_data( + result = match layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; traversal_path.push((result, cont_lsn, layer.traversal_id())); } else if timeline.ancestor_timeline.is_some() { @@ -1840,9 +1998,11 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - let (partitioning, _lsn) = - self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? + let (partitioning, _lsn) = self + .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .await?; + self.create_image_layers(&partitioning, self.initdb_lsn, true) + .await? } else { // normal case, write out a L0 delta layer file. 
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; @@ -1979,7 +2139,7 @@ impl Timeline { // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); - self.metrics.current_physical_size_gauge.add(sz); + self.metrics.resident_physical_size_gauge.add(sz); // update metrics self.metrics.num_persistent_files_created.inc_by(1); self.metrics.persistent_bytes_written.inc_by(sz); @@ -1987,15 +2147,28 @@ impl Timeline { Ok((new_delta_filename, LayerFileMetadata::new(sz))) } - fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> { - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 == Lsn(0) - || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + async fn repartition( + &self, + lsn: Lsn, + partition_size: u64, + ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { - let keyspace = self.collect_keyspace(lsn)?; - let partitioning = keyspace.partition(partition_size); + let partitioning_guard = self.partitioning.lock().unwrap(); + if partitioning_guard.1 != Lsn(0) + && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold + { + // no repartitioning needed + return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + } + } + let keyspace = self.collect_keyspace(lsn).await?; + let partitioning = keyspace.partition(partition_size); + + let mut partitioning_guard = self.partitioning.lock().unwrap(); + if lsn > partitioning_guard.1 { *partitioning_guard = (partitioning, lsn); - return Ok((partitioning_guard.0.clone(), lsn)); + } else { + warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless"); } Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -2041,7 +2214,7 @@ impl Timeline { Ok(false) } - fn create_image_layers( + async fn create_image_layers( &self, partitioning: &KeyPartitioning, lsn: Lsn, @@ -2068,7 +2241,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn) { + let img = match self.get_download(key, lsn).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -2131,7 +2304,9 @@ impl Timeline { layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); - self.metrics.current_physical_size_gauge.add(metadata.len()); + self.metrics + .resident_physical_size_gauge + .add(metadata.len()); layers.insert_historic(Arc::new(l)); } drop(layers); @@ -2443,7 +2618,9 @@ impl Timeline { } // update the timeline's physical size - self.metrics.current_physical_size_gauge.add(metadata.len()); + self.metrics + .resident_physical_size_gauge + .add(metadata.len()); new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); @@ -2456,7 +2633,7 @@ impl Timeline { for l in deltas_to_compact { if let Some(path) = l.local_path() { self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .sub(path.metadata()?.len()); } layer_names_to_delete.push(l.filename()); @@ -2526,7 +2703,10 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp)? { + match self + .find_lsn_for_timestamp(pitr_timestamp) + .no_ondemand_download()? 
+ { LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, LsnForTimestamp::Future(lsn) => { debug!("future({})", lsn); @@ -2743,11 +2923,11 @@ impl Timeline { for doomed_layer in layers_to_remove { if let Some(path) = doomed_layer.local_path() { self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .sub(path.metadata()?.len()); } layer_names_to_delete.push(doomed_layer.filename()); - doomed_layer.delete()?; + doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? layers.remove_historic(doomed_layer); result.layers_removed += 1; } @@ -2778,7 +2958,7 @@ impl Timeline { key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - ) -> anyhow::Result { + ) -> PageReconstructResult { // Perform WAL redo if needed data.records.reverse(); @@ -2790,9 +2970,11 @@ impl Timeline { key, img_lsn ); - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { - bail!("base image for {} at {} not found", key, request_lsn); + PageReconstructResult::from(anyhow!( + "base image for {key} at {request_lsn} not found" + )) } } else { // We need to do WAL redo. @@ -2800,12 +2982,12 @@ impl Timeline { // If we don't have a base image, then the oldest WAL record better initialize // the page if data.img.is_none() && !data.records.first().unwrap().1.will_init() { - bail!( + PageReconstructResult::from(anyhow!( "Base image for {} at {} not found, but got {} WAL records", key, request_lsn, data.records.len() - ); + )) } else { if data.img.is_some() { trace!( @@ -2820,14 +3002,18 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = self + let img = match self .walredo_mgr .request_redo(key, request_lsn, data.img, data.records, self.pg_version) - .context("Failed to reconstruct a page image:")?; + .context("Failed to reconstruct a page image:") + { + Ok(img) => img, + Err(e) => return PageReconstructResult::from(e), + }; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); - cache + if let Err(e) = cache .memorize_materialized_page( self.tenant_id, self.timeline_id, @@ -2835,30 +3021,324 @@ impl Timeline { last_rec_lsn, &img, ) - .context("Materialized page memoization failed")?; + .context("Materialized page memoization failed") + { + return PageReconstructResult::from(e); + } } - Ok(img) + PageReconstructResult::Success(img) } } } + + /// Download a layer file from remote storage and insert it into the layer map. + /// + /// It's safe to call this function for the same layer concurrently. In that case: + /// - If the layer has already been downloaded, `OK(...)` is returned. + /// - If the layer is currently being downloaded, we wait until that download succeeded / failed. + /// - If it succeeded, we return `Ok(...)`. + /// - If it failed, we or another concurrent caller will initiate a new download attempt. + /// + /// Download errors are classified and retried if appropriate by the underlying RemoteTimelineClient function. + /// It has an internal limit for the maximum number of retries and prints appropriate log messages. + /// If we exceed the limit, it returns an error, and this function passes it through. + /// The caller _could_ retry further by themselves by calling this function again, but _should not_ do it. + /// The reason is that they cannot distinguish permanent errors from temporary ones, whereas + /// the underlying RemoteTimelineClient can. + /// + /// There is no internal timeout or slowness detection. 
+ /// If the caller has a deadline or needs a timeout, they can simply stop polling: + /// we're **cancellation-safe** because the download happens in a separate task_mgr task. + /// So, the current download attempt will run to completion even if we stop polling. + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + pub async fn download_remote_layer( + self: Arc, + remote_layer: Arc, + ) -> anyhow::Result<()> { + let permit = match Arc::clone(&remote_layer.ongoing_download) + .acquire_owned() + .await + { + Ok(permit) => permit, + Err(_closed) => { + info!("download of layer has already finished"); + return Ok(()); + } + }; + + let (sender, receiver) = tokio::sync::oneshot::channel(); + // Spawn a task so that download does not outlive timeline when we detach tenant / delete timeline. + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::RemoteDownloadTask, + Some(self.tenant_id), + Some(self.timeline_id), + &format!("download layer {}", remote_layer.short_id()), + false, + async move { + let remote_client = self.remote_client.as_ref().unwrap(); + + // Does retries + exponential back-off internally. + // When this fails, don't layer further retry attempts here. + let result = remote_client + .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata) + .await; + + if let Ok(size) = &result { + // XXX the temp file is still around in Err() case + // and consumes space until we clean up upon pageserver restart. + self.metrics.resident_physical_size_gauge.add(*size); + + // Download complete. Replace the RemoteLayer with the corresponding + // Delta- or ImageLayer in the layer map. + let new_layer = remote_layer.create_downloaded_layer(self.conf, *size); + let mut layers = self.layers.write().unwrap(); + { + let l: Arc = remote_layer.clone(); + layers.remove_historic(l); + } + layers.insert_historic(new_layer); + drop(layers); + + // Now that we've inserted the download into the layer map, + // close the semaphore. This will make other waiters for + // this download return Ok(()). + assert!(!remote_layer.ongoing_download.is_closed()); + remote_layer.ongoing_download.close(); + } else { + // Keep semaphore open. We'll drop the permit at the end of the function. + } + + // Don't treat it as an error if the task that triggered the download + // is no longer interested in the result. + sender.send(result.map(|_sz| ())).ok(); + + // In case we failed and there are other waiters, this will make one + // of them retry the download in a new task. + // XXX: This resets the exponential backoff because it's a new call to + // download_layer file. + drop(permit); + + Ok(()) + }, + ); + + receiver.await.context("download task cancelled")? 
+ } + + pub async fn spawn_download_all_remote_layers( + self: Arc, + ) -> Result { + let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap(); + if let Some(st) = &*status_guard { + match &st.state { + DownloadRemoteLayersTaskState::Running => { + return Err(st.clone()); + } + DownloadRemoteLayersTaskState::ShutDown + | DownloadRemoteLayersTaskState::Completed => { + *status_guard = None; + } + } + } + + let self_clone = Arc::clone(&self); + let task_id = task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::DownloadAllRemoteLayers, + Some(self.tenant_id), + Some(self.timeline_id), + "download all remote layers task", + false, + async move { + self_clone.download_all_remote_layers().await; + let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); + match &mut *status_guard { + None => { + warn!("tasks status is supposed to be Some(), since we are running"); + } + Some(st) => { + let exp_task_id = format!("{}", task_mgr::current_task_id().unwrap()); + if st.task_id != exp_task_id { + warn!("task id changed while we were still running, expecting {} but have {}", exp_task_id, st.task_id); + } else { + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + }; + Ok(()) + } + .instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); + + let initial_info = DownloadRemoteLayersTaskInfo { + task_id: format!("{task_id}"), + state: DownloadRemoteLayersTaskState::Running, + total_layer_count: 0, + successful_download_count: 0, + failed_download_count: 0, + }; + *status_guard = Some(initial_info.clone()); + + Ok(initial_info) + } + + async fn download_all_remote_layers(self: &Arc) { + let mut downloads: FuturesUnordered<_> = { + let layers = self.layers.read().unwrap(); + layers + .iter_historic_layers() + .filter_map(|l| l.downcast_remote_layer()) + .map({ + |l| { + let self_clone = Arc::clone(self); + self_clone.download_remote_layer(l) + } + }) + .collect() + }; + + macro_rules! lock_status { + ($st:ident) => { + let mut st = self.download_all_remote_layers_task_info.write().unwrap(); + let st = st + .as_mut() + .expect("this function is only called after the task has been spawned"); + assert_eq!( + st.task_id, + format!( + "{}", + task_mgr::current_task_id().expect("we run inside a task_mgr task") + ) + ); + let $st = st; + }; + } + + { + lock_status!(st); + st.total_layer_count = downloads.len().try_into().unwrap(); + } + loop { + tokio::select! { + dl = downloads.next() => { + lock_status!(st); + match dl { + None => break, + Some(Ok(())) => { + st.successful_download_count += 1; + }, + Some(Err(e)) => { + error!(error = %e, "layer download failed"); + st.failed_download_count += 1; + } + } + } + _ = task_mgr::shutdown_watcher() => { + // Kind of pointless to watch for shutdowns here, + // as download_remote_layer spawns other task_mgr tasks internally. + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::ShutDown; + } + } + } + { + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + + pub fn get_download_all_remote_layers_task_info(&self) -> Option { + self.download_all_remote_layers_task_info + .read() + .unwrap() + .clone() + } } -/// An error happened in a get() operation. 
-#[derive(thiserror::Error)]
-pub enum PageReconstructError {
-    #[error(transparent)]
-    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
-
-    #[error(transparent)]
-    WalRedo(#[from] crate::walredo::WalRedoError),
-}
-
-impl std::fmt::Debug for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
-        match self {
-            PageReconstructError::Other(err) => err.fmt(f),
-            PageReconstructError::WalRedo(err) => err.fmt(f),
+/// Helper function to deal with [`PageReconstructResult`].
+///
+/// Takes a sync closure that returns a [`PageReconstructResult`].
+/// If it is [`PageReconstructResult::NeedsDownload`],
+/// do the download and retry the closure.
+///
+/// ### Background
+///
+/// This is a crutch to make on-demand downloads efficient in
+/// our async-sync-async sandwich codebase. Some context:
+///
+/// - The code that does the downloads uses async Rust.
+/// - The code that initiates the download sits many levels deep in sync Rust.
+/// - The sync code must wait for the download to finish to
+///   make further progress.
+/// - The sync code is invoked directly from async functions upstack.
+///
+/// Example (there are also much worse ones where the sandwich is taller):
+///
+///    async handle_get_page_at_lsn_request    page_service.rs
+///    sync  get_rel_page_at_lsn               timeline.rs
+///    sync  timeline.get                      timeline.rs
+///    sync  get_reconstruct_data              timeline.rs
+///    async download_remote_layer             timeline.rs
+///
+/// It is not possible to Timeline::download_remote_layer().await within
+/// get_reconstruct_data, so instead, we return [`PageReconstructResult::NeedsDownload`],
+/// which contains references to the [`Timeline`] and the [`RemoteLayer`].
+/// We bubble that error upstack to the async code, which can then call
+/// `Timeline::download_remote_layer().await`.
+/// That is _efficient_ because tokio can use the same OS thread to do
+/// other work while we're waiting for the download.
+///
+/// It is a deliberate decision to use a new result type to communicate
+/// the need for download instead of adding another variant to [`PageReconstructError`].
+/// The reason is that with the latter approach, any place that does
+/// `?` on a `Result<T, PageReconstructError>` will implicitly ignore the
+/// need for download. We want that to be explicit, so that
+/// - the code base becomes greppable for places that don't do a download
+/// - future code changes will need to explicitly address on-demand download
+///
+/// Alternatives to consider in the future:
+///
+/// - Inside `get_reconstruct_data`, we can std::thread::spawn a thread
+///   and use it to block_on the download_remote_layer future.
+///   That is obviously inefficient as it creates one thread per download.
+/// - Convert everything to async. The problem here is that the sync
+///   functions are used by many other sync functions. So, the scope
+///   creep of such a conversion is tremendous.
+/// - Compromise between the two: implement async functions for each sync
+///   function. Switch over the hot code paths (GetPage()) to use the
+///   async path, so that the hot path doesn't spawn threads. Other code
+///   paths would remain sync initially, and get converted to async over time.
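// [Editorial illustration, not part of the patch] A usage sketch for the
// helper defined directly below, mirroring Timeline::get_download() from
// earlier in this patch: wrap the sync, PageReconstructResult-returning call
// in a closure, and let the helper download missing remote layers and retry.
// `get_page_with_download` is a hypothetical name; Key, Lsn and Bytes are the
// crate types used throughout this file.
async fn get_page_with_download(
    timeline: &Timeline,
    key: Key,
    lsn: Lsn,
) -> anyhow::Result<Bytes> {
    // The closure may run several times: each NeedsDownload triggers
    // Timeline::download_remote_layer() and then another attempt.
    with_ondemand_download(|| timeline.get(key, lsn)).await
}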
+/// +pub async fn with_ondemand_download(mut f: F) -> Result +where + F: Send + FnMut() -> PageReconstructResult, + T: Send, +{ + loop { + let closure_result = f(); + match closure_result { + PageReconstructResult::NeedsDownload(weak_timeline, weak_remote_layer) => { + // if the timeline is gone, it has likely been deleted / tenant detached + let tl = weak_timeline.upgrade().context("timeline is gone")?; + // if the remote layer got removed, retry the function, it might succeed now + let remote_layer = match weak_remote_layer.upgrade() { + None => { + info!("remote layer is gone, retrying closure"); + continue; + } + Some(l) => l, + }; + // Does retries internally + tl.download_remote_layer(remote_layer).await?; + // Download successful, retry the closure + continue; + } + PageReconstructResult::Success(closure_value) => return Ok(closure_value), + PageReconstructResult::Error(e) => { + return Err(anyhow::Error::new(e).context("Failed to reconstruct the page")) + } } } } @@ -2868,7 +3348,7 @@ impl std::fmt::Debug for PageReconstructError { fn layer_traversal_error( msg: String, path: Vec<(ValueReconstructResult, Lsn, TraversalId)>, -) -> Result<(), PageReconstructError> { +) -> PageReconstructResult<()> { // We want the original 'msg' to be the outermost context. The outermost context // is the most high-level information, which also gets propagated to the client. let mut msg_iter = path @@ -2885,7 +3365,7 @@ fn layer_traversal_error( // Append all subsequent traversals, and the error message 'msg', as contexts. let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - Err(PageReconstructError::Other(msg)) + PageReconstructResult::from(msg) } /// Various functions to mutate the timeline. diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 46e4acd50c..fb216123c1 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -12,7 +12,7 @@ //! use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; use once_cell::sync::OnceCell; -use std::fs::{File, OpenOptions}; +use std::fs::{self, File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; @@ -240,6 +240,10 @@ impl VirtualFile { self.with_file("fsync", |file| file.sync_all())? } + pub fn metadata(&self) -> Result { + self.with_file("metadata", |file| file.metadata())? + } + /// Helper function that looks up the underlying File for this VirtualFile, /// opening it and evicting some other File if necessary. It calls 'func' /// with the physical File. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e8a2e99f06..e3453dfe06 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -31,7 +31,10 @@ use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::pgdatadir_mapping::*; +use crate::tenant::PageReconstructResult; use crate::tenant::Timeline; +use crate::try_no_ondemand_download; +use crate::try_page_reconstruct_result as try_prr; use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -52,10 +55,10 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result { + pub fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
- let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint).no_ondemand_download()?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -80,10 +83,12 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, - ) -> Result<()> { + ) -> PageReconstructResult<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version) - .context("failed decoding wal record")?; + try_prr!( + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record") + ); let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -98,7 +103,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded)?; + try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded)); } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -106,13 +111,13 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create)?; + try_prr!(self.ingest_xlog_smgr_create(modification, &create)); } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate)?; + try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate)); } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( "handle RM_DBASE_ID for Postgres version {:?}", @@ -125,14 +130,14 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb)?; + try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); } } } else if self.timeline.pg_version == 15 { @@ -148,14 +153,14 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; + try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); } } } @@ -167,38 +172,38 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::Clog, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec)?; + try_prr!(self.ingest_clog_truncate_record(modification, &xlrec)); } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( + try_prr!(self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, - )?; + )); } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED || info == pg_constants::XLOG_XACT_ABORT_PREPARED { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( + try_prr!(self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, - )?; + )); // Remove twophase file. 
see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", @@ -206,9 +211,10 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid)?; + try_prr!(modification.drop_twophase_file(parsed_xact.xid)); } else if info == pg_constants::XLOG_XACT_PREPARE { - modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?; + try_prr!(modification + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))); } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -217,34 +223,34 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::MultiXactOffsets, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::MultiXactMembers, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(modification, &xlrec)?; + try_prr!(self.ingest_multixact_create_record(modification, &xlrec)); } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec)?; + try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec)); } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded)?; + try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded)); } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -258,7 +264,9 @@ impl<'a> WalIngest<'a> { { let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; + let xlog_checkpoint = try_prr!( + CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint") + ); trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", xlog_checkpoint.oldestXid, @@ -279,22 +287,23 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk)?; + try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk)); } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { - let new_checkpoint_bytes = self.checkpoint.encode()?; + let new_checkpoint_bytes = + try_prr!(self.checkpoint.encode().context("encode checkpoint")); - modification.put_checkpoint(new_checkpoint_bytes)?; + try_prr!(modification.put_checkpoint(new_checkpoint_bytes)); self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + try_prr!(modification.commit()); - Ok(()) + PageReconstructResult::Success(()) } fn ingest_decoded_block( @@ -303,7 +312,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, - ) -> Result<()> { + ) -> PageReconstructResult<()> { let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, @@ -323,7 +332,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + && !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)) { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -345,15 +354,20 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; + try_no_ondemand_download!(self.put_rel_page_image( + modification, + rel, + blk.blkno, + image.freeze() + )); } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec)?; + try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec)); } - Ok(()) + PageReconstructResult::Success(()) } fn ingest_heapam_record( @@ -505,7 +519,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, rec: &XlCreateDatabase, - ) -> Result<()> { + ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; let src_db_id = rec.src_db_id; @@ -520,14 +534,16 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn)?; + .list_rels(src_tablespace_id, src_db_id, req_lsn) + .no_ondemand_download()?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .no_ondemand_download()?; modification.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; @@ -536,7 +552,10 @@ impl<'a> WalIngest<'a> { assert_eq!(src_rel.spcnode, src_tablespace_id); assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?; + let nblocks = modification + .tline + .get_rel_size(src_rel, req_lsn, true) + .no_ondemand_download()?; let dst_rel = RelTag { spcnode: tablespace_id, dbnode: db_id, @@ -553,7 +572,8 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, 
blknum, req_lsn, true)?; + .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + .no_ondemand_download()?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; } @@ -657,7 +677,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -713,7 +733,11 @@ impl<'a> WalIngest<'a> { relnode: xnode.relnode, }; let last_lsn = self.timeline.get_last_record_lsn(); - if modification.tline.get_rel_exists(rel, last_lsn, true)? { + if modification + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { self.put_rel_drop(modification, rel)?; } } @@ -725,7 +749,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, xlrec: &XlClogTruncate, - ) -> Result<()> { + ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db @@ -767,7 +791,8 @@ impl<'a> WalIngest<'a> { let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn)? + .list_slru_segments(SlruKind::Clog, req_lsn) + .no_ondemand_download()? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { @@ -923,10 +948,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; - modification.put_rel_page_image(rel, blknum, img)?; - Ok(()) + ) -> PageReconstructResult<()> { + try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum)); + try_prr!(modification.put_rel_page_image(rel, blknum, img)); + PageReconstructResult::Success(()) } fn put_rel_wal_record( @@ -936,7 +961,8 @@ impl<'a> WalIngest<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; + self.handle_rel_extend(modification, rel, blknum) + .no_ondemand_download()?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -946,7 +972,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { modification.put_rel_truncation(rel, nblocks)?; Ok(()) } @@ -956,11 +982,17 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? { + fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { + let nblocks = if !self + .timeline + .get_rel_exists(rel, lsn, true) + .no_ondemand_download()? + { 0 } else { - self.timeline.get_rel_size(rel, lsn, true)? + self.timeline + .get_rel_size(rel, lsn, true) + .no_ondemand_download()? }; Ok(nblocks) } @@ -970,30 +1002,31 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, - ) -> Result<()> { + ) -> PageReconstructResult<()> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? 
{ - // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0)?; - 0 - } else { - self.timeline.get_rel_size(rel, last_lsn, true)? - }; + let old_nblocks = + if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) { + // create it with 0 size initially, the logic below will extend it + try_prr!(modification.put_rel_creation(rel, 0)); + 0 + } else { + try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true)) + }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks)?; + try_prr!(modification.put_rel_extend(rel, new_nblocks)); // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())); } } - Ok(()) + PageReconstructResult::Success(()) } fn put_slru_page_image( @@ -1015,7 +1048,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1027,13 +1060,16 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn)? + .get_slru_segment_exists(kind, segno, last_lsn) + .no_ondemand_download()? { // create it with 0 size initially, the logic below will extend it modification.put_slru_segment_creation(kind, segno, 0)?; 0 } else { - self.timeline.get_slru_segment_size(kind, segno, last_lsn)? + self.timeline + .get_slru_segment_size(kind, segno, last_lsn) + .no_ondemand_download()? }; if new_nblocks > old_nblocks { @@ -1099,58 +1135,103 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A)?; - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .no_ondemand_download()?; m.commit()?; assert_current_logical_size(&*tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); // Check page contents at each LSN assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1161,20 +1242,36 @@ mod tests { assert_current_logical_size(&*tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1182,35 +1279,62 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x68)); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; m.commit()?; - 
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x68), false) + .no_ondemand_download()?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .no_ondemand_download()?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x70), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .no_ondemand_download()?, ZERO_PAGE ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .no_ondemand_download()?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + 1501 + ); for blk in 2..1500 { assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .no_ondemand_download()?, ZERO_PAGE ); } assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1500") ); @@ -1226,12 +1350,24 @@ mod tests { let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .no_ondemand_download()?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); @@ -1239,19 +1375,36 @@ mod tests { m.commit()?; // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false) + .no_ondemand_download()?, + false + ); // FIXME: should fail //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none()); // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .no_ondemand_download()?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1); + assert_eq!( + tline + 
.get_rel_exists(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + 1 + ); Ok(()) } @@ -1270,23 +1423,45 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .no_ondemand_download()?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1298,24 +1473,38 @@ mod tests { m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } // should still see all blocks with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + relsize + ); for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1326,18 +1515,32 @@ mod tests { let mut m = tline.begin_modification(lsn); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .no_ondemand_download()?; } m.commit()?; - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); let data = format!("foo blk {} 
at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1358,14 +1561,18 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); - walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .no_ondemand_download()?; m.commit()?; } assert_current_logical_size(&*tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE + 1 ); @@ -1374,7 +1581,12 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, + RELSEG_SIZE + ); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block @@ -1383,7 +1595,9 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE - 1 ); assert_current_logical_size(&*tline, Lsn(lsn)); @@ -1397,7 +1611,9 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, size as BlockNumber ); diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index a65703bca9..aeb7601af7 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -407,7 +407,7 @@ impl WalreceiverState { .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id)) + .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) }); let now = Utc::now().naive_utc(); diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 5b7e60aa5e..cc318cccc8 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,7 +20,9 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use crate::{ + metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate, +}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -248,9 +250,16 @@ pub async fn handle_walreceiver_connection( // at risk of hitting a deadlock. 
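+                    // NB (editorial note; an assumption based on the surrounding code, not
+                    // stated in the patch itself): `ingest_record` now returns a
+                    // `PageReconstructResult`, and `with_ondemand_download` from
+                    // `crate::tenant` appears to re-run the closure once any layer files
+                    // that were missing locally have been downloaded, which is why
+                    // `recdata` is cloned in the call below.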
ensure!(lsn.is_aligned()); - walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) - .context("could not ingest record at {lsn}")?; + with_ondemand_download(|| { + walingest.ingest_record( + recdata.clone(), + lsn, + &mut modification, + &mut decoded, + ) + }) + .await + .with_context(|| format!("could not ingest record at {lsn}"))?; fail_point!("walreceiver-after-ingest"); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 38fb9a4247..7581140934 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1,6 +1,7 @@ //! //! Functions for parsing WAL records. //! + use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::pg_constants; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 8ea3f13bf5..d83a74ae14 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -318,14 +318,8 @@ def remote_consistent_lsn( detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: - # No remote information at all. This happens right after creating - # a timeline, before any part of it has been uploaded to remote - # storage yet. - return 0 - else: - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload( diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 5fe6c43528..9236137d19 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -49,7 +49,7 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", - "pageserver_current_physical_size", + "pageserver_resident_physical_size", "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d52ca38447..5b00ebdea7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -26,6 +26,7 @@ import asyncpg import backoff # type: ignore import boto3 import jwt +import prometheus_client import psycopg2 import pytest import requests @@ -41,6 +42,7 @@ from fixtures.utils import ( get_self_dir, subprocess_capture, ) +from prometheus_client.parser import text_string_to_metric_families # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -1204,8 +1206,22 @@ class PageserverHttpClient(requests.Session): # there are no tests for those right now. 
return size - def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") + def timeline_list( + self, + tenant_id: TenantId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + ) -> List[Dict[str, Any]]: + + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -1239,13 +1255,13 @@ class PageserverHttpClient(requests.Session): tenant_id: TenantId, timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, - include_non_incremental_physical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: params["include-non-incremental-logical-size"] = "yes" - if include_non_incremental_physical_size: - params["include-non-incremental-physical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", @@ -1320,11 +1336,88 @@ class PageserverHttpClient(requests.Session): res_json = res.json() assert res_json is None + def timeline_spawn_download_remote_layers( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> dict[str, Any]: + + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_poll_download_remote_layers_status( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + spawn_response: dict[str, Any], + poll_state=None, + ) -> None | dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + + # assumption in this API client here is that nobody else spawns the task + assert res_json["task_id"] == spawn_response["task_id"] + + if poll_state is None or res_json["state"] == poll_state: + return res_json + return None + + def timeline_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + errors_ok=False, + at_least_one_download=True, + ): + res = self.timeline_spawn_download_remote_layers(tenant_id, timeline_id) + while True: + completed = self.timeline_poll_download_remote_layers_status( + tenant_id, timeline_id, res, poll_state="Completed" + ) + if not completed: + time.sleep(0.1) + continue + if not errors_ok: + assert completed["failed_download_count"] == 0 + if at_least_one_download: + assert completed["successful_download_count"] > 0 + return completed + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) return res.text + def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str): + raw = self.get_metrics() + family: List[prometheus_client.Metric] = 
list(text_string_to_metric_families(raw)) + [metric] = [m for m in family if m.name == metric_name] + [sample] = [ + s + for s in metric.samples + if s.labels["tenant_id"] == str(tenant_id) + and s.labels["timeline_id"] == str(timeline_id) + ] + return sample.value + + def get_metric_value(self, name: str) -> Optional[str]: + metrics = self.get_metrics() + relevant = [line for line in metrics.splitlines() if line.startswith(name)] + if len(relevant) == 0: + log.info(f'could not find metric "{name}"') + return None + assert len(relevant) == 1 + return relevant[0].lstrip(name).strip() + @dataclass class PageserverPort: @@ -1622,7 +1715,12 @@ class NeonCli(AbstractNeonCli): pageserver_config_override=self.env.pageserver.config_override, ) - res = self.raw_cli(cmd) + s3_env_vars = None + if self.env.remote_storage is not None and isinstance( + self.env.remote_storage, S3Storage + ): + s3_env_vars = self.env.remote_storage.access_env_vars() + res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) res.check_returncode() return res @@ -2996,13 +3094,55 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def assert_no_in_progress_downloads_for_tenant( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, +def wait_until(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. + """ + last_exception = None + for i in range(number_of_iterations): + try: + res = func() + except Exception as e: + log.info("waiting for %s iteration %s failed", func, i + 1) + last_exception = e + time.sleep(interval) + continue + return res + raise Exception("timed out while waiting for %s" % func) from last_exception + + +def wait_while(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns false, or throws an exception. + """ + for i in range(number_of_iterations): + try: + if not func(): + return + log.info("waiting for %s iteration %s failed", func, i + 1) + time.sleep(interval) + continue + except Exception: + return + raise Exception("timed out while waiting for %s" % func) + + +def assert_tenant_status( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str ): tenant_status = pageserver_http_client.tenant_status(tenant) - assert tenant_status["has_in_progress_downloads"] is False, tenant_status - assert tenant_status["state"] == "Active" + log.info(f"tenant_status: {tenant_status}") + assert tenant_status["state"] == expected_status, tenant_status + + +def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId): + tenants = ps_http.tenant_list() + matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] + assert len(matching) < 2 + if len(matching) == 0: + return None + return matching[0] def remote_consistent_lsn( @@ -3010,14 +3150,15 @@ def remote_consistent_lsn( ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: + if detail["remote_consistent_lsn"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. 
return Lsn(0) - assert isinstance(lsn_str, str) - return Lsn(lsn_str) + else: + lsn_str = detail["remote_consistent_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) def wait_for_upload( @@ -3030,6 +3171,7 @@ def wait_for_upload( for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: + log.info("wait finished") return log.info( "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 71964f622f..05d5788028 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*Failed to load delta layer.*", + ".*Failed to reconstruct the page.*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -87,9 +87,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}" ) - # Second timeline has no ancestors, only the metadata file and no layer files. - # That is checked explicitly in the pageserver, and causes the tenant to be marked - # as broken. + # Second timeline has no ancestors, only the metadata file and no layer files locally, + # and we don't have the remote storage enabled. It is loaded into memory, but getting + # the basebackup from it will fail. with pytest.raises( Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken" ) as err: @@ -97,8 +97,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): log.info(f"As expected, compute startup failed for timeline with missing layers: {err}") # Third timeline will also fail during basebackup, because the layer file is corrupt. + # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) - with pytest.raises(Exception, match="Failed to load delta layer") as err: + with pytest.raises(Exception, match="Failed to reconstruct the page") as err: pg3.start() log.info( f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}" diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 7f86d92962..fa1bf0fbb2 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -37,7 +37,7 @@ def metrics_handler(request: Request) -> Response: checks = { "written_size": lambda value: value > 0, - "physical_size": lambda value: value >= 0, + "resident_size": lambda value: value >= 0, # >= 0 check here is to avoid race condition when we receive metrics before # remote_uploaded is updated "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py new file mode 100644 index 0000000000..352ae4b95c --- /dev/null +++ b/test_runner/regress/test_ondemand_download.py @@ -0,0 +1,437 @@ +# It's possible to run any regular test with the local fs remote storage via +# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
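+#
+# The tests below exercise on-demand download of layer files from remote storage:
+# `test_ondemand_download_large_rel` checks that a point query only pulls down the
+# layers it needs, `test_ondemand_download_timetravel` checks the same for reads at
+# older LSNs, and `test_download_remote_layers_api` covers the explicit
+# `download_remote_layers` HTTP API, including its failure path (via a failpoint).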
+
+from pathlib import Path
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    assert_tenant_status,
+    available_remote_storages,
+    wait_for_last_record_lsn,
+    wait_for_sk_commit_lsn_to_reach_remote_storage,
+    wait_for_upload,
+    wait_until,
+)
+from fixtures.types import Lsn
+from fixtures.utils import query_scalar
+
+
+def get_num_downloaded_layers(client, tenant_id, timeline_id):
+    value = client.get_metric_value(
+        f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}'
+    )
+    if value is None:
+        return 0
+    return int(value)
+
+
+#
+# If you have a large relation, check that the pageserver downloads parts of it as
+# required by queries.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_ondemand_download_large_rel(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_large_rel",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable background GC
+            "gc_period": "10 m",
+            "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            "compaction_threshold": "3",
+            "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    # We want to make sure that the data is large enough that the keyspace is partitioned.
+    num_rows = 1000000
+
+    with pg.cursor() as cur:
+        # data loading may take a while, so increase statement timeout
+        cur.execute("SET statement_timeout='300s'")
+        cur.execute(
+            f"""CREATE TABLE tbl AS SELECT g as id, 'long string to consume some space' || g
+            from generate_series(1,{num_rows}) g"""
+        )
+        cur.execute("CREATE INDEX ON tbl (id)")
+        cur.execute("VACUUM tbl")
+
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    # wait until pageserver receives that data
+    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+    # run checkpoint manually to be sure that data landed in remote storage
+    client.timeline_checkpoint(tenant_id, timeline_id)
+
+    # wait until pageserver successfully uploaded a checkpoint to remote storage
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+    log.info("uploads have finished")
+
+    ##### Stop the first pageserver instance, erase all its data
+    pg.stop()
+    env.pageserver.stop()
+
+    # remove all the layer files
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start()
+
+    pg.start()
+    before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+
+    # Probe in the middle of the table. There's a high chance that the beginning
+    # and end of the table were stored together in the same layer files with data
+    # from other tables, and with the entry that stores the size of the
+    # relation, so they are likely already downloaded. But the middle of the
+    # table should not have been needed by anything yet.
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1
+
+    after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    log.info(f"layers downloaded before {before_downloads} and after {after_downloads}")
+    assert after_downloads > before_downloads
+
+
+#
+# If you have a relation with a long history of updates, the pageserver downloads the layer
+# files containing the history as needed by timetravel queries.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_ondemand_download_timetravel(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_timetravel",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC & compaction
+            # We don't want GC, that would break the assertion about num downloads.
+            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1 * 1024 ** 2}",  # 1 MB
+            "compaction_threshold": "1",
+            "image_creation_threshold": "1",
+            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    lsns = []
+
+    table_len = 10000
+    with pg.cursor() as cur:
+        cur.execute(
+            f"""
+        CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
+        INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len});
+        """
+        )
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+        # run checkpoint manually to be sure that data landed in remote storage
+        client.timeline_checkpoint(tenant_id, timeline_id)
+        lsns.append((0, current_lsn))
+
+    for checkpoint_number in range(1, 20):
+        with pg.cursor() as cur:
+            cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}")
+            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+        lsns.append((checkpoint_number, current_lsn))
+
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+        # run checkpoint manually to be sure that data landed in remote storage
+        client.timeline_checkpoint(tenant_id, timeline_id)
+
+    # wait until pageserver successfully uploaded a checkpoint to remote storage
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+    log.info("uploads have finished")
+
+    ##### Stop the first pageserver instance, erase all its data
+    env.postgres.stop_all()
+
+    
wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) + + def get_api_current_physical_size(): + d = client.timeline_detail(tenant_id, timeline_id) + return d["current_physical_size"] + + def get_resident_physical_size(): + return client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + filled_current_physical = get_api_current_physical_size() + log.info(filled_current_physical) + filled_size = get_resident_physical_size() + log.info(filled_size) + assert filled_current_physical == filled_size, "we don't yet do layer eviction" + + env.pageserver.stop() + + # remove all the layer files + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start() + + wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + + # current_physical_size reports sum of layer file sizes, regardless of local or remote + assert filled_current_physical == get_api_current_physical_size() + + num_layers_downloaded = [0] + physical_size = [get_resident_physical_size()] + for (checkpoint_number, lsn) in lsns: + pg_old = env.postgres.create_start( + branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn + ) + with pg_old.cursor() as cur: + # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000 + assert ( + query_scalar( + cur, + f"select count(*) from testtab where checkpoint_number<>{checkpoint_number}", + ) + == 0 + ) + assert ( + query_scalar( + cur, + f"select count(*) from testtab where checkpoint_number={checkpoint_number}", + ) + == table_len + ) + + after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + num_layers_downloaded.append(after_downloads) + log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}") + + # Check that on each query, we need to download at least one more layer file. However in + # practice, thanks to compaction and the fact that some requests need to download + # more history, some points-in-time are covered by earlier downloads already. But + # in broad strokes, as we query more points-in-time, more layers need to be downloaded. + # + # Do a fuzzy check on that, by checking that after each point-in-time, we have downloaded + # more files than we had three iterations ago. 
+        log.info(f"layers downloaded after checkpoint {checkpoint_number}: {after_downloads}")
+        if len(num_layers_downloaded) > 4:
+            assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4]
+
+        # Likewise, assert that the physical_size metric grows as layers are downloaded
+        physical_size.append(get_resident_physical_size())
+        log.info(f"physical_size[-1]={physical_size[-1]}")
+        if len(physical_size) > 4:
+            assert physical_size[-1] > physical_size[len(physical_size) - 4]
+
+    # current_physical_size reports sum of layer file sizes, regardless of local or remote
+    assert filled_current_physical == get_api_current_physical_size()
+
+
+#
+# Ensure that the `download_remote_layers` API works
+#
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_download_remote_layers_api(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_download_remote_layers_api",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC & compaction
+            # We don't want GC, that would break the assertion about num downloads.
+            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1 * 1024 ** 2}",  # 1 MB
+            "compaction_threshold": "1",
+            "image_creation_threshold": "1",
+            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    table_len = 10000
+    with pg.cursor() as cur:
+        cur.execute(
+            f"""
+        CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
+        INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len});
+        """
+        )
+
+    env.postgres.stop_all()
+
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    def get_api_current_physical_size():
+        d = client.timeline_detail(tenant_id, timeline_id)
+        return d["current_physical_size"]
+
+    def get_resident_physical_size():
+        return client.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+
+    filled_current_physical = get_api_current_physical_size()
+    log.info(filled_current_physical)
+    filled_size = get_resident_physical_size()
+    log.info(filled_size)
+    assert filled_current_physical == filled_size, "we don't yet do layer eviction"
+
+    env.pageserver.stop()
+
+    # remove all the layer files
+    # XXX only delete some of the layer files, to show that it really just downloads all the layers
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    # Shut down safekeepers before starting the pageserver.
+    # If we don't, the tenant's walreceiver handler will trigger
+    # the logical size computation task, and that downloads layers,
+    # which makes our assertions on size fail. 
+ for sk in env.safekeepers: + sk.stop(immediate=True) + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}) + env.pageserver.allowed_errors.extend( + [ + f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint", + f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size", + ] + ) + + wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + + ###### Phase 1: exercise download error code path + assert ( + filled_current_physical == get_api_current_physical_size() + ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" + post_unlink_size = get_resident_physical_size() + log.info(post_unlink_size) + assert ( + post_unlink_size < filled_size + ), "we just deleted layers and didn't cause anything to re-download them yet" + assert filled_size - post_unlink_size > 5 * ( + 1024**2 + ), "we may be downloading some layers as part of tenant activation" + + # issue downloads that we know will fail + info = client.timeline_download_remote_layers( + tenant_id, timeline_id, errors_ok=True, at_least_one_download=False + ) + log.info(f"info={info}") + assert info["state"] == "Completed" + assert info["total_layer_count"] > 0 + assert info["successful_download_count"] == 0 + assert ( + info["failed_download_count"] > 0 + ) # can't assert == total_layer_count because attach + tenant status downloads some layers + assert ( + info["total_layer_count"] + == info["successful_download_count"] + info["failed_download_count"] + ) + assert get_api_current_physical_size() == filled_current_physical + assert ( + get_resident_physical_size() == post_unlink_size + ), "didn't download anything new due to failpoint" + # would be nice to assert that the layers in the layer map are still RemoteLayer + + ##### Retry, this time without failpoints + client.configure_failpoints(("remote-storage-download-pre-rename", "off")) + info = client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False) + log.info(f"info={info}") + + assert info["state"] == "Completed" + assert info["total_layer_count"] > 0 + assert info["successful_download_count"] > 0 + assert info["failed_download_count"] == 0 + assert ( + info["total_layer_count"] + == info["successful_download_count"] + info["failed_download_count"] + ) + + refilled_size = get_resident_physical_size() + log.info(refilled_size) + + assert filled_size == refilled_size, "we redownloaded all the layers" + assert get_api_current_physical_size() == filled_current_physical + + for sk in env.safekeepers: + sk.start() + + # ensure that all the data is back + pg_old = env.postgres.create_start(branch_name="main") + with pg_old.cursor() as cur: + assert query_scalar(cur, "select count(*) from testtab") == table_len diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 94e483cdb5..32c25b2e8c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PageserverApiException, RemoteStorageKind, - assert_no_in_progress_downloads_for_tenant, available_remote_storages, wait_for_last_flush_lsn, wait_for_last_record_lsn, @@ -62,9 +61,9 @@ def test_remote_storage_backup_and_restore( neon_env_builder.pageserver_config_override = 
"test_remote_failures=1" data_id = 1 - data_secret = "very secret secret" + data = "just some data" - ##### First start, insert secret data and upload it to the remote storage + ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() # FIXME: Is this expected? @@ -97,8 +96,8 @@ def test_remote_storage_backup_and_restore( with pg.cursor() as cur: cur.execute( f""" - CREATE TABLE t{checkpoint_number}(id int primary key, secret text); - INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + CREATE TABLE t{checkpoint_number}(id int primary key, data text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data}|{checkpoint_number}'); """ ) current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -133,36 +132,53 @@ def test_remote_storage_backup_and_restore( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - # Introduce failpoint in download - pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) - + # Introduce failpoint in list remote timelines code path to make tenant_attach fail. + # This is before the failures injected by test_remote_failures, so it's a permanent error. + pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) + env.pageserver.allowed_errors.append( + ".*error attaching tenant: storage-sync-list-remote-timelines", + ) + # Attach it. This HTTP request will succeed and launch a + # background task to load the tenant. In that background task, + # listing the remote timelines will fail because of the failpoint, + # and the tenant will be marked as Broken. client.tenant_attach(tenant_id) - - # is there a better way to assert that failpoint triggered? wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) - # assert cannot attach timeline that is scheduled for download - # FIXME implement layer download retries + # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): client.tenant_attach(tenant_id) - tenant_status = client.tenant_status(tenant_id) - log.info("Tenant status with active failpoint: %s", tenant_status) - # FIXME implement layer download retries - # assert tenant_status["has_in_progress_downloads"] is True - - # trigger temporary download files removal + # Restart again, this implicitly clears the failpoint. + # test_remote_failures=1 remains active, though, as it's in the pageserver config. + # This means that any of the remote client operations after restart will exercise the + # retry code path. + # + # The initiated attach operation should survive the restart, and continue from where it was. env.pageserver.stop() + layer_download_failed_regex = ( + r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" + ) + assert not env.pageserver.log_contains( + layer_download_failed_regex + ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" env.pageserver.start() - # ensure that an initiated attach operation survives pageserver restart + # Ensure that the pageserver remembers that the tenant was attaching, by + # trying to attach it again. It should fail. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): client.tenant_attach(tenant_id) - log.info("waiting for timeline redownload") + log.info("waiting for tenant to become active. 
this should be quick with on-demand download")
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
     wait_until(
-        number_of_iterations=20,
+        number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=tenant_active,
     )
 
     detail = client.timeline_detail(tenant_id, timeline_id)
@@ -171,14 +187,18 @@ def test_remote_storage_backup_and_restore(
         Lsn(detail["last_record_lsn"]) >= current_lsn
     ), "current db Lsn should should not be less than the one stored on remote storage"
 
+    log.info("select some data, this will cause layers to be downloaded")
     pg = env.postgres.create_start("main")
     with pg.cursor() as cur:
         for checkpoint_number in checkpoint_numbers:
             assert (
-                query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};")
-                == f"{data_secret}|{checkpoint_number}"
+                query_scalar(cur, f"SELECT data FROM t{checkpoint_number} WHERE id = {data_id};")
+                == f"{data}|{checkpoint_number}"
             )
 
+    log.info("ensure that we needed to retry downloads due to test_remote_failures=1")
+    assert env.pageserver.log_contains(layer_download_failed_regex)
+
 
 # Exercises the upload queue retry code paths.
 # - Use failpoints to cause all storage ops to fail
@@ -338,7 +358,6 @@ def test_remote_storage_upload_queue_retries(
     def tenant_active():
         all_states = client.tenant_list()
         [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["has_in_progress_downloads"] is False
         assert tenant["state"] == "Active"
 
     wait_until(30, 1, tenant_active)
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 081fd0fc2f..1b58937e2a 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -13,12 +13,15 @@ from fixtures.neon_fixtures import (
     PageserverHttpClient,
     PortDistributor,
     Postgres,
-    assert_no_in_progress_downloads_for_tenant,
+    assert_tenant_status,
+    tenant_exists,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
+    wait_while,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until
+from fixtures.utils import query_scalar, start_in_background, subprocess_capture
 
 
 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -406,17 +409,13 @@ def test_tenant_relocation(
     # call to attach timeline to new pageserver
     new_pageserver_http.tenant_attach(tenant_id)
 
-    # check that it shows that download is in progress
+    # wait for tenant to finish attaching
     tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
-    assert tenant_status.get("has_in_progress_downloads"), tenant_status
-
-    # wait until tenant is downloaded
+    assert tenant_status["state"] in ["Attaching", "Active"]
     wait_until(
         number_of_iterations=10,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(
-            new_pageserver_http, tenant_id
-        ),
+        func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"),
     )
 
     check_timeline_attached(
@@ -459,9 +458,15 @@ def test_tenant_relocation(
 
     # detach tenant from old pageserver before we check
     # that all the data is there to be sure that old pageserver
-    # is no longer involved, and if it is, we will see the errors
+    # is no longer involved, and if it is, we will see the error
     pageserver_http.tenant_detach(tenant_id)
 
+    # Wait a little, so that the detach
operation has time to finish. + wait_while( + number_of_iterations=100, + interval=1, + func=lambda: tenant_exists(pageserver_http, tenant_id), + ) post_migration_check(pg_main, 500500, old_local_path_main) post_migration_check(pg_second, 1001000, old_local_path_second) diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index ddae1a67ff..4eba4ce942 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -20,44 +20,48 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): matching = [t for t in all_states if TenantId(t["id"]) == tenant] return get_only_element(matching)["state"] - def get_metric_value(name): - metrics = client.get_metrics() - relevant = [line for line in metrics.splitlines() if line.startswith(name)] - if len(relevant) == 0: - return 0 - line = get_only_element(relevant) - value = line.lstrip(name).strip() - return int(value) - def delete_all_timelines(tenant: TenantId): timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) + def assert_active(tenant): + assert get_state(tenant) == "Active" + # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) + assert ( + get_state(tenant) == "Active" + ), "Pageserver should activate a tenant and start background jobs if timelines are loaded" # Stop compute pg.stop() - # Delete all timelines on all tenants + # Delete all timelines on all tenants. + # + # FIXME: we used to check that the background jobs are stopped when all timelines + # are removed, but we don't stop them anymore. Not sure if this test still makes sense + # or we should just remove it. 
for tenant_info in client.tenant_list(): tenant_id = TenantId(tenant_info["id"]) delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_active(tenant_id)) # Assert that all tasks finish quickly after tenant is detached - assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + assert task_starts is not None + assert int(task_starts) > 0 client.tenant_detach(tenant) client.tenant_detach(env.initial_tenant) def assert_tasks_finish(): - tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') - tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') - tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}') + tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}') log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended - assert tasks_panicked == 0 + assert tasks_panicked is None or int(tasks_panicked) == 0 wait_until(10, 0.2, assert_tasks_finish) diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 4cd74e17e9..6a5b4278da 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -21,7 +21,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, Postgres, RemoteStorageKind, - assert_no_in_progress_downloads_for_tenant, + assert_tenant_status, available_remote_storages, wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, @@ -179,14 +179,6 @@ def test_tenants_attached_after_download( tenant_id, timeline_id, env.safekeepers, env.pageserver ) - detail_before = client.timeline_detail( - tenant_id, timeline_id, include_non_incremental_physical_size=True - ) - assert ( - detail_before["current_physical_size_non_incremental"] - == detail_before["current_physical_size"] - ) - env.pageserver.stop() timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -200,13 +192,16 @@ def test_tenants_attached_after_download( assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}" ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + # FIXME: just starting the pageserver no longer downloads the + # layer files. Do we want to force download, or maybe run some + # queries, or is it enough that it starts up without layer files? 
env.pageserver.start() client = env.pageserver.http_client() wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=lambda: assert_tenant_status(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -218,12 +213,6 @@ def test_tenants_attached_after_download( timeline_id ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" - # Check that the physical size matches after re-downloading - detail_after = client.timeline_detail( - tenant_id, timeline_id, include_non_incremental_physical_size=True - ) - assert detail_before["current_physical_size"] == detail_after["current_physical_size"] - # Check that we had to retry the downloads assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*") @@ -297,7 +286,7 @@ def test_tenant_upgrades_index_json_from_v0( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), ) pg = env.postgres.create_start("main") @@ -404,7 +393,7 @@ def test_tenant_ignores_backup_file( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), ) pg = env.postgres.create_start("main") @@ -484,14 +473,15 @@ def test_tenant_redownloads_truncated_file_on_startup( index_part = local_fs_index_part(env, tenant_id, timeline_id) assert index_part["layer_metadata"][path.name]["file_size"] == expected_size - ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + ## Start the pageserver. It will notice that the file size doesn't match, and + ## rename away the local file. It will be re-downloaded when it's needed. env.pageserver.start() client = env.pageserver.http_client() wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=lambda: assert_tenant_status(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -503,6 +493,10 @@ def test_tenant_redownloads_truncated_file_on_startup( timeline_id ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + # Request non-incremental logical size. Calculating it needs the layer file that + # we corrupted, forcing it to be redownloaded. 
+ client.timeline_detail(tenant_id, timeline_id, include_non_incremental_logical_size=True) + assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded" # the remote side of local_layer_truncated diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 523c946a68..3b41cc5c90 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -20,10 +20,12 @@ from fixtures.neon_fixtures import ( PortDistributor, Postgres, VanillaPostgres, + assert_tenant_status, wait_for_last_flush_lsn, + wait_until, ) from fixtures.types import TenantId, TimelineId -from fixtures.utils import get_timeline_dir_size, wait_until +from fixtures.utils import get_timeline_dir_size def test_timeline_size(neon_simple_env: NeonEnv): @@ -320,7 +322,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env.pageserver.stop() env.pageserver.start() - assert_physical_size(env, env.initial_tenant, new_timeline_id) + # Wait for the tenant to be loaded + client = env.pageserver.http_client() + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"), + ) + + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): @@ -341,7 +353,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): @@ -376,7 +390,9 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): @@ -415,7 +431,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) # The timeline logical and physical sizes are also exposed as prometheus metrics. 
@@ -448,7 +466,7 @@ def test_timeline_size_metrics( # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() matches = re.search( - f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', + f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', metrics, re.MULTILINE, ) @@ -507,11 +525,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: TimelineId): - res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) - return res["current_physical_size_non_incremental"] + def get_timeline_resident_physical_size(timeline: TimelineId): + sizes = get_physical_size_values(env, tenant, timeline) + assert_physical_size_invariants(sizes) + return sizes.prometheus_resident_physical - timeline_total_size = get_timeline_physical_size(timeline) + timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline) for i in range(10): n_rows = random.randint(100, 1000) @@ -528,22 +547,54 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, tenant, timeline) pageserver_http.timeline_checkpoint(tenant, timeline) - timeline_total_size += get_timeline_physical_size(timeline) + timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) pg.stop() - tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"]) - assert tenant_physical_size == timeline_total_size + # ensure that tenant_status current_physical size reports sum of timeline current_physical_size + tenant_current_physical_size = int( + client.tenant_status(tenant_id=tenant)["current_physical_size"] + ) + assert tenant_current_physical_size == sum( + [tl["current_physical_size"] for tl in client.timeline_list(tenant_id=tenant)] + ) + # since we don't do layer eviction, current_physical_size is identical to resident physical size + assert timeline_total_resident_physical_size == tenant_current_physical_size -def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): - """Check the current physical size returned from timeline API - matches the total physical size of the timeline on disk""" +class TimelinePhysicalSizeValues: + api_current_physical: int + prometheus_resident_physical: int + python_timelinedir_layerfiles_physical: int + + +def get_physical_size_values( + env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId +) -> TimelinePhysicalSizeValues: + res = TimelinePhysicalSizeValues() + client = env.pageserver.http_client() - res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True) + + res.prometheus_resident_physical = client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + detail = client.timeline_detail( + tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True + ) + res.api_current_physical = detail["current_physical_size"] + timeline_path = env.timeline_dir(tenant_id, timeline_id) - assert res["current_physical_size"] == res["current_physical_size_non_incremental"] - assert res["current_physical_size"] == get_timeline_dir_size(timeline_path) + res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path) + + return res + + +def assert_physical_size_invariants(sizes: 
TimelinePhysicalSizeValues):
+    # resident physical size is defined as the sum of layer file sizes in the local timeline directory
+    assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
+    # we don't do layer eviction, so all layers are resident
+    assert sizes.api_current_physical == sizes.prometheus_resident_physical
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index d88ed319b5..77ec33f8b0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -585,17 +585,23 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
         if elapsed > wait_lsn_timeout:
             raise RuntimeError("Timed out waiting for WAL redo")
 
-        pageserver_lsn = Lsn(
-            env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"]
-        )
-        lag = last_lsn - pageserver_lsn
+        tenant_status = ps_cli.tenant_status(tenant_id)
+        if tenant_status["state"] == "Loading":
+            log.debug(f"Tenant {tenant_id} is still loading, retrying")
+        else:
+            pageserver_lsn = Lsn(
+                env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[
+                    "last_record_lsn"
+                ]
+            )
+            lag = last_lsn - pageserver_lsn
 
-        if time.time() > last_debug_print + 10 or lag <= 0:
-            last_debug_print = time.time()
-            log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
+            if time.time() > last_debug_print + 10 or lag <= 0:
+                last_debug_print = time.time()
+                log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
 
-        if lag <= 0:
-            break
+            if lag <= 0:
+                break
 
         time.sleep(1)
 
From f5f1197e15cc68bbf47ef91653b50b60d99ec7eb Mon Sep 17 00:00:00 2001
From: Sergey Melnikov
Date: Thu, 22 Dec 2022 11:25:56 +0100
Subject: [PATCH 133/167] Build vm-compute-node images (#3174)

---
 .github/workflows/build_and_test.yml | 64 ++++++++++++++++++++--------
 Dockerfile.compute-node-v14          |  3 --
 Dockerfile.compute-node-v15          |  3 --
 3 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 43b855a2b0..6443a56afc 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -555,10 +555,14 @@ jobs:
       - name: Kaniko build compute tools
         run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
 
-  compute-node-image-v14:
+  compute-node-image:
     runs-on: [ self-hosted, dev, x64 ]
     container: gcr.io/kaniko-project/executor:v1.9.0-debug
     needs: [ tag ]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15 ]
     defaults:
       run:
         shell: sh -eu {0}
@@ -573,32 +577,40 @@
     steps:
      - name: Checkout
       uses: actions/checkout@v1 # v3 won't work with kaniko
       with:
         submodules: true
         fetch-depth: 0
 
     - name: Configure ECR login
       run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
-      - name: Kaniko build compute node with extensions v14
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Kaniko build compute node with extensions + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - compute-node-image-v15: + vm-compute-node-image: runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug - needs: [ tag ] + needs: [ tag, compute-node-image ] + strategy: + fail-fast: false + matrix: + version: [ v14, v15 ] defaults: run: shell: sh -eu {0} steps: - - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko - with: - submodules: true - fetch-depth: 0 + - name: Downloading latest vm-builder + run: | + curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder + chmod +x vm-builder - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Pulling compute-node image + run: | + docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - - name: Kaniko build compute node with extensions v15 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Build vm image + run: | + ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Pushing vm-compute-node image + run: | + docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} test-images: - needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] + needs: [ tag, neon-image, compute-node-image, compute-tools-image ] runs-on: [ self-hosted, dev, x64 ] steps: @@ -642,13 +654,13 @@ jobs: promote-images: runs-on: [ self-hosted, dev, x64 ] - needs: [ tag, test-images ] + needs: [ tag, test-images, vm-compute-node-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: fail-fast: false matrix: - name: [ neon, compute-node-v14, compute-node-v15, compute-tools ] + name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools] steps: - name: Promote image to latest @@ -681,9 +693,15 @@ jobs: - name: Pull compute node v14 image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 + - name: Pull vm compute node v14 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 + - name: Pull compute node v15 image from ECR run: crane pull 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
 
+      - name: Pull vm compute node v15 image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+
       - name: Pull rust image from ECR
         run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
 
@@ -695,7 +713,9 @@
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
 
       - name: Configure Docker Hub login
         run: |
@@ -712,9 +732,15 @@
       - name: Push compute node v14 image to Docker Hub
         run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
 
+      - name: Push vm compute node v14 image to Docker Hub
+        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+
       - name: Push compute node v15 image to Docker Hub
         run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
 
+      - name: Push vm compute node v15 image to Docker Hub
+        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+
       - name: Push rust image to Docker Hub
         run: crane push rust neondatabase/rust:pinned
 
@@ -726,7 +752,9 @@
           crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
 
   calculate-deploy-targets:
     runs-on: [ self-hosted, dev, x64 ]
diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14
index ad036338a0..1ffabafd51 100644
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
 # Remove headers that we won't need anymore - we've completed installation of all extensions
 RUN rm -r /usr/local/pgsql/include
 
-# Remove now-useless PGXS src infrastructure
-RUN rm -r /usr/local/pgsql/lib/pgxs/src
-
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index 4526644421..11cefcc2da 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto # Remove headers that we won't need anymore - we've completed installation of all extensions RUN rm -r /usr/local/pgsql/include -# Remove now-useless PGXS src infrastructure -RUN rm -r /usr/local/pgsql/lib/pgxs/src - # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. From fca25edae8aea52c764093d0f7677c11f66a7609 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 22 Dec 2022 14:27:48 +0200 Subject: [PATCH 134/167] Fix 1.66 Clippy warnings (#3178) 1.66 release speeds up compile times for over 10% according to tests. Also its Clippy finds plenty of old nits in our code: * useless conversion, `foo as u8` where `foo: u8` and similar, removed `as u8` and similar * useless references and dereferenced (that were automatically adjusted by the compiler), removed various `&` and `*` * bool -> u8 conversion via `if/else`, changed to `u8::from` * Map `.iter()` calls where only values were used, changed to `.values()` instead Standing out lints: * `Eq` is missing in our protoc generated structs. Silenced, does not seem crucial for us. * `fn default` looks like the one from `Default` trait, so I've implemented that instead and replaced the `dummy_*` method in tests with `::default()` invocation * Clippy detected that ``` if retry_attempt < u32::MAX { retry_attempt += 1; } ``` is a saturating add and proposed to replace it. --- compute_tools/src/compute.rs | 4 +-- control_plane/src/bin/neon_local.rs | 4 +-- control_plane/src/broker.rs | 2 +- control_plane/src/compute.rs | 14 ++++---- control_plane/src/local_env.rs | 10 +++--- control_plane/src/pageserver.rs | 2 +- libs/pageserver_api/src/models.rs | 8 ++--- libs/postgres_ffi/src/nonrelfile_utils.rs | 10 +++--- libs/postgres_ffi/src/xlog_utils.rs | 4 +-- libs/postgres_ffi/wal_craft/src/lib.rs | 8 ++--- libs/pq_proto/src/lib.rs | 2 +- libs/utils/src/crashsafe.rs | 12 +++---- libs/utils/src/sock_split.rs | 2 +- pageserver/benches/bench_walredo.rs | 2 +- pageserver/src/basebackup.rs | 2 +- pageserver/src/bin/pageserver.rs | 4 +-- pageserver/src/bin/pageserver_binutils.rs | 6 ++-- pageserver/src/config.rs | 2 +- pageserver/src/import_datadir.rs | 2 +- pageserver/src/page_service.rs | 6 ++-- pageserver/src/pgdatadir_mapping.rs | 10 +++--- pageserver/src/storage_sync2.rs | 4 +-- pageserver/src/tenant.rs | 24 +++++++------- pageserver/src/tenant/disk_btree.rs | 16 +++++----- pageserver/src/tenant/ephemeral_file.rs | 2 +- pageserver/src/tenant/image_layer.rs | 2 +- pageserver/src/tenant/metadata.rs | 3 +- pageserver/src/tenant/timeline.rs | 10 +++--- pageserver/src/tenant_config.rs | 32 ++----------------- pageserver/src/walingest.rs | 22 ++++++------- .../src/walreceiver/connection_manager.rs | 2 +- pageserver/src/walredo.rs | 6 ++-- proxy/src/scram/secret.rs | 4 +-- safekeeper/src/control_file.rs | 4 +-- safekeeper/src/metrics.rs | 2 +- safekeeper/src/wal_backup.rs | 6 ++-- safekeeper/src/wal_storage.rs | 14 ++++---- storage_broker/benches/rps.rs | 2 +- storage_broker/src/lib.rs | 4 +++ 39 files changed, 123 insertions(+), 152 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index bfdd2340ec..eceff0fc4e 
100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -175,7 +175,7 @@ impl ComputeNode { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) - .args(&["--sync-safekeepers"]) + .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .stdout(Stdio::piped()) .spawn() @@ -253,7 +253,7 @@ impl ComputeNode { // Run postgres as a child process. let mut pg = Command::new(&self.pgbin) - .args(&["-D", &self.pgdata]) + .args(["-D", &self.pgdata]) .spawn() .expect("cannot start postgres process"); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 53fd3100c7..71de741640 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -549,7 +549,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { table.load_preset(comfy_table::presets::NOTHING); - table.set_header(&[ + table.set_header([ "NODE", "ADDRESS", "TIMELINE", @@ -584,7 +584,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .map(|name| name.as_str()) .unwrap_or("?"); - table.add_row(&[ + table.add_row([ node_name.as_str(), &node.address.to_string(), &node.timeline_id.to_string(), diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index bd60580012..6c0604a076 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -17,7 +17,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { "storage_broker", &env.base_data_dir, &env.storage_broker_bin(), - &args, + args, [], background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)), || { diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 0eec25c51e..547aa14d39 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -44,7 +44,7 @@ impl ComputeControlPlane { let mut nodes = BTreeMap::default(); let pgdatadirspath = &env.pg_data_dirs_path(); - for tenant_dir in fs::read_dir(&pgdatadirspath) + for tenant_dir in fs::read_dir(pgdatadirspath) .with_context(|| format!("failed to list {}", pgdatadirspath.display()))? { let tenant_dir = tenant_dir?; @@ -67,8 +67,8 @@ impl ComputeControlPlane { fn get_port(&mut self) -> u16 { 1 + self .nodes - .iter() - .map(|(_name, node)| node.address.port()) + .values() + .map(|node| node.address.port()) .max() .unwrap_or(self.base_port) } @@ -183,7 +183,7 @@ impl PostgresNode { fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres"); - let mut cmd = Command::new(&pg_path); + let mut cmd = Command::new(pg_path); cmd.arg("--sync-safekeepers") .env_clear() @@ -261,7 +261,7 @@ impl PostgresNode { } fn create_pgdata(&self) -> Result<()> { - fs::create_dir_all(&self.pgdata()).with_context(|| { + fs::create_dir_all(self.pgdata()).with_context(|| { format!( "could not create data directory {}", self.pgdata().display() @@ -478,7 +478,7 @@ impl PostgresNode { postgresql_conf_path.to_str().unwrap() ) })?; - fs::remove_dir_all(&self.pgdata())?; + fs::remove_dir_all(self.pgdata())?; self.create_pgdata()?; // 2. 
Bring back config files @@ -514,7 +514,7 @@ impl PostgresNode { "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); - fs::remove_dir_all(&self.pgdata())?; + fs::remove_dir_all(self.pgdata())?; } else { self.pg_ctl(&["stop"], &None)?; } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ed9e467eee..ea936640ec 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -404,7 +404,7 @@ impl LocalEnv { } } - fs::create_dir(&base_path)?; + fs::create_dir(base_path)?; // generate keys for jwt // openssl genrsa -out private_key.pem 2048 @@ -413,7 +413,7 @@ impl LocalEnv { private_key_path = base_path.join("auth_private_key.pem"); let keygen_output = Command::new("openssl") .arg("genrsa") - .args(&["-out", private_key_path.to_str().unwrap()]) + .args(["-out", private_key_path.to_str().unwrap()]) .arg("2048") .stdout(Stdio::null()) .output() @@ -430,10 +430,10 @@ impl LocalEnv { // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem let keygen_output = Command::new("openssl") .arg("rsa") - .args(&["-in", private_key_path.to_str().unwrap()]) + .args(["-in", private_key_path.to_str().unwrap()]) .arg("-pubout") - .args(&["-outform", "PEM"]) - .args(&["-out", public_key_path.to_str().unwrap()]) + .args(["-outform", "PEM"]) + .args(["-out", public_key_path.to_str().unwrap()]) .stdout(Stdio::null()) .output() .context("failed to generate auth private key")?; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3575e75db9..0c2415965a 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -241,7 +241,7 @@ impl PageServerNode { let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); args.push(Cow::Borrowed("--init")); - let init_output = Command::new(&self.env.pageserver_bin()) + let init_output = Command::new(self.env.pageserver_bin()) .args(args.iter().map(Cow::as_ref)) .envs(self.pageserver_env_variables()?) 
.output() diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 88603d9539..d954e5d21f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -323,7 +323,7 @@ impl PagestreamFeMessage { match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -333,7 +333,7 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -343,7 +343,7 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -354,7 +354,7 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.dbnode); } diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 01e5554b8a..5acf90be70 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -14,8 +14,8 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { status ); - let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32) - / pg_constants::CLOG_XACTS_PER_BYTE) as usize; + let byteno: usize = + ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize; let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8; @@ -25,13 +25,13 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { } pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 { - let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32) - / pg_constants::CLOG_XACTS_PER_BYTE) as usize; + let byteno: usize = + ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize; let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8; - ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8 + (page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK } // See CLOGPagePrecedes in clog.c diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 953723a8f0..272c4d6dcc 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -333,7 +333,7 @@ impl CheckPoint { // We need this segment to start compute node. // pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { - let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize); + let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE); let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); let hdr = XLogLongPageHeaderData { @@ -574,7 +574,7 @@ mod tests { // Rename file to partial to actually find last valid lsn, then rename it back. 
fs::rename( - cfg.wal_dir().join(&last_segment), + cfg.wal_dir().join(last_segment), cfg.wal_dir().join(format!("{}.partial", last_segment)), ) .unwrap(); diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index feec3b2ace..969befc8e7 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -81,7 +81,7 @@ impl Conf { .new_pg_command("initdb")? .arg("-D") .arg(self.datadir.as_os_str()) - .args(&["-U", "postgres", "--no-instructions", "--no-sync"]) + .args(["-U", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); ensure!( @@ -105,12 +105,12 @@ impl Conf { let unix_socket_dir_path = unix_socket_dir.path().to_owned(); let server_process = self .new_pg_command("postgres")? - .args(&["-c", "listen_addresses="]) + .args(["-c", "listen_addresses="]) .arg("-k") .arg(unix_socket_dir_path.as_os_str()) .arg("-D") .arg(self.datadir.as_os_str()) - .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output + .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) .stderr(Stdio::from(log_file)) .spawn()?; @@ -142,7 +142,7 @@ impl Conf { ); let output = self .new_pg_command("pg_waldump")? - .args(&[ + .args([ &first_segment_file.as_os_str(), &last_segment_file.as_os_str(), ]) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 0d698127b9..278f044c15 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -881,7 +881,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'k'); buf.put_u64(req.sent_ptr); buf.put_i64(req.timestamp); - buf.put_u8(if req.request_reply { 1 } else { 0 }); + buf.put_u8(u8::from(req.request_reply)); }); } } diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 3726779cb2..2c7e6e20ab 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -157,34 +157,34 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::AlreadyExists); let invalid_dir_path = file_path.join("folder"); - create_dir_all(&invalid_dir_path).unwrap_err(); + create_dir_all(invalid_dir_path).unwrap_err(); } #[test] fn test_path_with_suffix_extension() { let p = PathBuf::from("/foo/bar"); assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp").to_string_lossy(), "/foo/bar.temp" ); let p = PathBuf::from("/foo/bar"); assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp.temp").to_string_lossy(), "/foo/bar.temp.temp" ); let p = PathBuf::from("/foo/bar.baz"); assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp.temp").to_string_lossy(), "/foo/bar.baz.temp.temp" ); let p = PathBuf::from("/foo/bar.baz"); assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + &path_with_suffix_extension(p, ".temp").to_string_lossy(), "/foo/bar.baz..temp" ); let p = PathBuf::from("/foo/bar/dir/"); assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + &path_with_suffix_extension(p, ".temp").to_string_lossy(), "/foo/bar/dir..temp" ); } diff --git a/libs/utils/src/sock_split.rs b/libs/utils/src/sock_split.rs index 5e4598daf1..b0e5a0bf6a 100644 --- a/libs/utils/src/sock_split.rs +++ b/libs/utils/src/sock_split.rs @@ -50,7 +50,7 @@ impl BufStream { /// Returns a reference to the underlying TcpStream. 
fn get_ref(&self) -> &TcpStream { - &*self.0.get_ref().0 + &self.0.get_ref().0 } } diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 8f53fce027..61011c9f36 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -84,7 +84,7 @@ fn add_multithreaded_walredo_requesters( barrier.wait(); - execute_all(input, &*manager).unwrap(); + execute_all(input, &manager).unwrap(); barrier.wait(); } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index aa87865a8a..36664e119e 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -131,7 +131,7 @@ where // Create pgdata subdirs structure for dir in PGDATA_SUBDIRS.iter() { - let header = new_tar_header_dir(*dir)?; + let header = new_tar_header_dir(dir)?; self.ar.append(&header, &mut io::empty())?; } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index cc403ec2ea..e72a861be0 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -126,7 +126,7 @@ fn initialize_config( ); } // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| { + let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| { format!( "Failed to read pageserver config at '{}'", cfg_file_path.display() @@ -180,7 +180,7 @@ fn initialize_config( if update_config { info!("Writing pageserver config to '{}'", cfg_file_path.display()); - std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { + std::fs::write(cfg_file_path, toml.to_string()).with_context(|| { format!( "Failed to write pageserver config to '{}'", cfg_file_path.display() diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs index b1484ac45a..9da173c873 100644 --- a/pageserver/src/bin/pageserver_binutils.rs +++ b/pageserver/src/bin/pageserver_binutils.rs @@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> { } fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { - let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?; + let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; println!("{control_file:?}"); let control_file_initdb = Lsn(control_file.checkPoint); println!( @@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> { } fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> { - let metadata_bytes = std::fs::read(&path)?; + let metadata_bytes = std::fs::read(path)?; let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; @@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an if update_meta { let metadata_bytes = meta.to_bytes()?; - std::fs::write(&path, &metadata_bytes)?; + std::fs::write(path, metadata_bytes)?; } Ok(()) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c6f417390f..9334f88a7e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -722,7 +722,7 @@ impl PageServerConf { auth_validation_public_key_path: None, remote_storage_config: None, profiling: ProfilingConfig::Disabled, - default_tenant_conf: TenantConf::dummy_conf(), + default_tenant_conf: TenantConf::default(), broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5000), 
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 1684ca3c64..76ca183c9a 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -267,7 +267,7 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) } let nread = file.read_to_end(&mut buf)?; - if nread != WAL_SEGMENT_SIZE - offset as usize { + if nread != WAL_SEGMENT_SIZE - offset { // Maybe allow this for .partial files? error!("read only {} bytes from WAL file", nread); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fd4353a421..9b52fdaf68 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -444,9 +444,7 @@ impl PageServerHandler { pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); - tokio::task::block_in_place(|| { - import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn) - })?; + tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?; info!("wal import complete"); // Drain the rest of the Copy data @@ -658,7 +656,7 @@ impl PageServerHandler { tokio::task::block_in_place(|| { let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; - tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str()); + tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str()); basebackup.send_tarball() })?; pgb.write_message(&BeMessage::CopyDone)?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 77910bceda..793dddef01 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -710,14 +710,14 @@ impl<'a> DatadirModification<'a> { let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); - if r == None || r == Some(false) { + if r.is_none() || r == Some(false) { // The dbdir entry didn't exist, or it contained a // 'false'. The 'insert' call already updated it with // 'true', now write the updated 'dbdirs' map back. let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); } - if r == None { + if r.is_none() { // Create RelDirectory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), @@ -1095,9 +1095,7 @@ impl<'a> DatadirModification<'a> { // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. 
- return PageReconstructResult::from(anyhow::anyhow!( - "unexpected pending WAL record" - )); + PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record")) } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); @@ -1425,7 +1423,7 @@ fn twophase_key_range(xid: TransactionId) -> Range { field2: 0, field3: 0, field4: 0, - field5: if overflowed { 1 } else { 0 }, + field5: u8::from(overflowed), field6: next_xid, } } diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index a2337e8fd6..6883c11473 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -519,9 +519,9 @@ impl RemoteTimelineClient { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part .layer_metadata - .iter() + .values() // If we don't have the file size for the layer, don't account for it in the metric. - .map(|(_, ilmd)| ilmd.file_size.unwrap_or(0)) + .map(|ilmd| ilmd.file_size.unwrap_or(0)) .sum() } else { 0 diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1240a3b4fb..4129c205ad 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -337,7 +337,7 @@ impl TimelineUninitMark { let uninit_mark_parent = uninit_mark_file .parent() .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| { + ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") })?; crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; @@ -2321,12 +2321,12 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. 
let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await + size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await } } fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> { - fs::remove_dir_all(&timeline_dir) + fs::remove_dir_all(timeline_dir) .or_else(|e| { if e.kind() == std::io::ErrorKind::NotFound { // we can leave the uninit mark without a timeline dir, @@ -2342,7 +2342,7 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a timeline_dir.display() ) })?; - fs::remove_file(&uninit_mark).with_context(|| { + fs::remove_file(uninit_mark).with_context(|| { format!( "Failed to remove timeline uninit mark file {}", uninit_mark.display() @@ -2442,7 +2442,7 @@ fn try_create_target_tenant_dir( anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); - fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| { + fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( "failed to move tenant {} temporary directory {} into the permanent one {}", tenant_id, @@ -2496,9 +2496,9 @@ fn run_initdb( ); let initdb_output = Command::new(&initdb_bin_path) - .args(&["-D", &initdb_target_dir.to_string_lossy()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) + .args(["-D", &initdb_target_dir.to_string_lossy()]) + .args(["-U", &conf.superuser]) + .args(["-E", "utf8"]) .arg("--no-instructions") // This is only used for a temporary installation that is deleted shortly after, // so no need to fsync it @@ -2660,9 +2660,11 @@ pub mod harness { // Disable automatic GC and compaction to make the unit tests more deterministic. // The tests perform them manually if needed. - let mut tenant_conf = TenantConf::dummy_conf(); - tenant_conf.gc_period = Duration::ZERO; - tenant_conf.compaction_period = Duration::ZERO; + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; let tenant_id = TenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 33255dbd82..88dff32b76 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { off += keys_len as u64; let values_off = off as usize; - let values_len = num_children as usize * VALUE_SZ as usize; + let values_len = num_children as usize * VALUE_SZ; //off += values_len as u64; let prefix = &buf[prefix_off..prefix_off + prefix_len as usize]; @@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { while low < high { let mid = low + size / 2; - let key_off = mid as usize * self.suffix_len as usize; + let key_off = mid * self.suffix_len as usize; let suffix = &self.keys[key_off..key_off + self.suffix_len as usize]; // Does this match? 
keybuf[self.prefix_len as usize..].copy_from_slice(suffix); @@ -328,7 +328,7 @@ where while idx < node.num_children as usize { let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx as usize); + let value = node.value(idx); #[allow(clippy::collapsible_if)] if node.level == 0 { // leaf @@ -368,7 +368,7 @@ where key_off -= suffix_len; let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx as usize); + let value = node.value(idx); #[allow(clippy::collapsible_if)] if node.level == 0 { // leaf @@ -629,7 +629,7 @@ impl BuildNode { self.keys.extend(&key[self.prefix.len()..]); self.values.extend(value.0); - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); self.size += self.suffix_len + VALUE_SZ; @@ -674,7 +674,7 @@ impl BuildNode { self.size -= prefix_len * self.num_children as usize; self.size += prefix_len; - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); true @@ -684,7 +684,7 @@ impl BuildNode { /// Serialize the node to on-disk format. /// fn pack(&self) -> Bytes { - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); assert!(self.num_children > 0); @@ -940,7 +940,7 @@ mod tests { let t = -(f64::ln(u)); let key_int = (t * 1000000.0) as u128; - all_data.insert(key_int as u128, idx as u64); + all_data.insert(key_int, idx as u64); } // Build a tree from it diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 0774fa42a6..c433e65ad2 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -91,7 +91,7 @@ impl EphemeralFile { break; } - off += n as usize; + off += n; } Ok(()) } diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs index 1e129fc01d..4b43328f35 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/image_layer.rs @@ -569,7 +569,7 @@ impl ImageLayerWriterInner { lsn: self.lsn, }, ); - std::fs::rename(self.path, &final_path)?; + std::fs::rename(self.path, final_path)?; trace!("created image layer {}", layer.path().display()); diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index f3a0a5171a..297cccbe30 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -255,8 +255,7 @@ pub fn save_metadata( // fsync the parent directory to ensure the directory entry is durable if first_save { let timeline_dir = File::open( - &path - .parent() + path.parent() .expect("Metadata should always have a parent dir"), )?; timeline_dir.sync_all()?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f4288fea36..25a9e1ec51 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1327,10 +1327,8 @@ impl Timeline { index_part.timeline_layers.len() ); remote_client.init_upload_queue(index_part)?; - let local_only_filenames = self - 
.create_remote_layers(index_part, local_layers, disk_consistent_lsn) - .await?; - local_only_filenames + self.create_remote_layers(index_part, local_layers, disk_consistent_lsn) + .await? } None => { info!("initializing upload queue as empty"); @@ -3425,9 +3423,9 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { let mut new_path = path.to_owned(); for i in 0u32.. { - new_path.set_file_name(format!("{}.{}.old", filename, i)); + new_path.set_file_name(format!("{filename}.{i}.old")); if !new_path.exists() { - std::fs::rename(&path, &new_path)?; + std::fs::rename(path, &new_path)?; return Ok(()); } } diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 1204d1abd8..8569c70217 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -191,11 +191,10 @@ impl TenantConfOpt { } } -impl TenantConf { - pub fn default() -> TenantConf { +impl Default for TenantConf { + fn default() -> Self { use defaults::*; - - TenantConf { + Self { checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) .expect("cannot parse default checkpoint timeout"), @@ -220,29 +219,4 @@ impl TenantConf { trace_read_requests: false, } } - - pub fn dummy_conf() -> Self { - TenantConf { - checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_timeout: Duration::from_secs(600), - compaction_target_size: 4 * 1024 * 1024, - compaction_period: Duration::from_secs(10), - compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, - gc_horizon: defaults::DEFAULT_GC_HORIZON, - gc_period: Duration::from_secs(10), - image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD, - pitr_interval: Duration::from_secs(60 * 60), - walreceiver_connect_timeout: humantime::parse_duration( - defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, - ) - .unwrap(), - lagging_wal_timeout: humantime::parse_duration( - defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT, - ) - .unwrap(), - max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) - .unwrap(), - trace_read_requests: false, - } - } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e3453dfe06..26a77c02d4 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -317,7 +317,7 @@ impl<'a> WalIngest<'a> { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, - forknum: blk.forknum as u8, + forknum: blk.forknum, }; // @@ -1131,7 +1131,7 @@ mod tests { async fn test_relsize() -> Result<()> { let tenant = TenantHarness::create("test_relsize")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline)?; let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A)?; @@ -1155,7 +1155,7 @@ mod tests { .no_ondemand_download()?; m.commit()?; - assert_current_logical_size(&*tline, Lsn(0x50)); + assert_current_logical_size(&tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( @@ -1239,7 +1239,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x60)); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; m.commit()?; - assert_current_logical_size(&*tline, Lsn(0x60)); + assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation assert_eq!( @@ -1347,7 +1347,7 @@ mod tests { async fn test_drop_extend() -> Result<()> { let tenant = TenantHarness::create("test_drop_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline)?; let mut m = tline.begin_modification(Lsn(0x20)); walingest @@ -1416,7 +1416,7 @@ mod tests { async fn test_truncate_extend() -> Result<()> { let tenant = TenantHarness::create("test_truncate_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline)?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1554,7 +1554,7 @@ mod tests { async fn test_large_rel() -> Result<()> { let tenant = TenantHarness::create("test_large_rel")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline)?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { @@ -1567,7 +1567,7 @@ mod tests { m.commit()?; } - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( tline @@ -1587,7 +1587,7 @@ mod tests { .no_ondemand_download()?, RELSEG_SIZE ); - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); // Truncate another block lsn += 0x10; @@ -1600,7 +1600,7 @@ mod tests { .no_ondemand_download()?, RELSEG_SIZE - 1 ); - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time // This tests the behavior at segment boundaries @@ -1619,7 +1619,7 @@ mod tests { size -= 1; } - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); Ok(()) } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index aeb7601af7..8b60e59305 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -805,7 +805,7 @@ fn wal_stream_connection_config( auth_token: Option<&str>, ) -> anyhow::Result { let (host, port) = - parse_host_port(&listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; + parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; let port = port.unwrap_or(5432); Ok(PgConnectionConfig::new_host_port(host, port) .extend_options([ diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ca7cfb7413..7cf489562b 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -409,7 +409,7 @@ impl PostgresRedoManager { key ); for &xid in xids { - let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -459,7 +459,7 @@ impl 
PostgresRedoManager { key ); for &xid in xids { - let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -647,7 +647,7 @@ impl PostgresRedoProcess { info!("running initdb in {}", datadir.display()); let initdb = Command::new(pg_bin_dir_path.join("initdb")) - .args(&["-D", &datadir.to_string_lossy()]) + .args(["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 89668465fa..424beccec9 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -48,7 +48,7 @@ impl ServerSecret { Self { iterations: 4096, - salt_base64: base64::encode(&mocked_salt), + salt_base64: base64::encode(mocked_salt), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, @@ -68,7 +68,7 @@ impl ServerSecret { Some(Self { iterations, - salt_base64: base64::encode(&salt), + salt_base64: base64::encode(salt), stored_key: password.client_key().sha256(), server_key: password.server_key(), doomed: false, diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index f4a0f8520c..ba5e453e41 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -239,7 +239,7 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, FileStorage::load_control_file_conf(conf, ttid)?, @@ -250,7 +250,7 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); let storage = FileStorage::create_new(ttid, conf, state.clone())?; Ok((storage, state)) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index d4d3d37737..b21770686c 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -425,7 +425,7 @@ impl Collector for TimelineCollector { .set(tli.num_computes as i64); self.acceptor_term .with_label_values(labels) - .set(tli.persisted_state.acceptor_state.term as u64); + .set(tli.persisted_state.acceptor_state.term); self.written_wal_bytes .with_label_values(labels) .set(tli.wal_storage.write_wal_bytes); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index ae4d4cce09..fc971ca753 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -346,9 +346,7 @@ impl WalBackupTask { backup_lsn, commit_lsn, e ); - if retry_attempt < u32::MAX { - retry_attempt += 1; - } + retry_attempt = retry_attempt.saturating_add(1); } } } @@ -387,7 +385,7 @@ async fn backup_single_segment( ) -> Result<()> { let segment_file_path = seg.file_path(timeline_dir)?; let remote_segment_path = segment_file_path - .strip_prefix(&workspace_dir) + .strip_prefix(workspace_dir) .context("Failed to strip workspace dir prefix") .and_then(RemotePath::new) .with_context(|| { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 52368bb719..41457868fe 100644 --- 
a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -223,7 +223,7 @@ impl PhysicalStorage { // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(&wal_file_partial_path, &wal_file_path)?; + fs::rename(wal_file_partial_path, wal_file_path)?; } else { // otherwise, file can be reused later self.file = Some(file); @@ -249,7 +249,7 @@ impl PhysicalStorage { while !buf.is_empty() { // Extract WAL location for this block - let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size); let segno = self.write_lsn.segment_number(self.wal_seg_size); // If crossing a WAL boundary, only write up until we reach wal segment size. @@ -366,7 +366,7 @@ impl Storage for PhysicalStorage { self.fdatasync_file(&mut unflushed_file)?; } - let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = end_pos.segment_offset(self.wal_seg_size); let segno = end_pos.segment_number(self.wal_seg_size); // Remove all segments after the given LSN. @@ -383,7 +383,7 @@ impl Storage for PhysicalStorage { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(&wal_file_path, &wal_file_partial_path)?; + fs::rename(wal_file_path, wal_file_partial_path)?; } // Update LSNs @@ -416,7 +416,7 @@ fn remove_segments_from_disk( let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; - for entry in fs::read_dir(&timeline_dir)? { + for entry in fs::read_dir(timeline_dir)? { let entry = entry?; let entry_path = entry.path(); let fname = entry_path.file_name().unwrap(); @@ -499,7 +499,7 @@ impl WalReader { // How much to read and send in message? We cannot cross the WAL file // boundary, and we don't want send more than provided buffer. - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.pos.segment_offset(self.wal_seg_size); let send_size = min(buf.len(), self.wal_seg_size - xlogoff); // Read some data from the file. @@ -518,7 +518,7 @@ impl WalReader { /// Open WAL segment at the current position of the reader. async fn open_segment(&self) -> Result>> { - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); let wal_file_path = self.timeline_dir.join(wal_file_name); diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 1262bd9333..f3544a7cb8 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -160,7 +160,7 @@ async fn main() -> Result<(), Box> { } for _i in 0..args.num_pubs { let c = None; - tokio::spawn(publish(c, args.num_subs as u64)); + tokio::spawn(publish(c, args.num_subs)); } h.await?; diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index d12a79a69f..8441aaf625 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -13,6 +13,10 @@ use proto::{ // Code generated by protobuf. pub mod proto { + // Tonic does derives as `#[derive(Clone, PartialEq, ::prost::Message)]` + // we don't use these types for anything but broker data transmission, + // so it's ok to ignore this one. 
+ #![allow(clippy::derive_partial_eq_without_eq)] tonic::include_proto!("storage_broker"); } From 707d1c1c948ded4efee06ab18880c876639f3ce1 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 22 Dec 2022 13:34:16 +0100 Subject: [PATCH 135/167] Fix vm-compute-image upload to dockerhub (#3181) --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6443a56afc..ff433decf7 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -733,13 +733,13 @@ jobs: run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} - name: Push vm compute node v14 image to Docker Hub - run: crane push compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} + run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - name: Push compute node v15 image to Docker Hub run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} - name: Push vm compute node v15 image to Docker Hub - run: crane push compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} + run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned From 201fedd65ca281d0e5b4f3d3a8ebef7c1a4108e7 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 22 Dec 2022 12:40:39 +0000 Subject: [PATCH 136/167] tpch-compare: use rust image instead of rustlegacy (#3182) --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 07e111b67c..59317f0a47 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -407,7 +407,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init timeout-minutes: 360 # 6h From 5a762744c7db4356753dcf0efa8e96d72f85d06d Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 21 Dec 2022 12:08:54 +0200 Subject: [PATCH 137/167] Collect core dump backtraces in compute_ctl. Scan core dumps directory on exit. In case of existing core dumps call gdb/lldb to get a backtrace and log it. By default look for core dumps in postgres data directory as core.. That is how core collection is configured in our k8s nodes (and a reasonable convention in general). 
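With `core_pattern` set to `core` and `core_uses_pid` set to `1`, the kernel writes dumps
named `core.<pid>` into the working directory of the crashed process, which for postgres is
the data directory. The detection rule therefore boils down to "any file in that directory
whose name starts with `core.`". A minimal standalone sketch of just that rule (the helper
name here is hypothetical; the real logic lives in `ComputeNode::check_for_core_dumps` below):

```
use std::path::{Path, PathBuf};

// Collect every file in `dir` whose name starts with "core." and treat it as a
// core dump. Sketch only; it mirrors the filter used in check_for_core_dumps.
fn find_core_dumps(dir: &Path) -> std::io::Result<Vec<PathBuf>> {
    let mut cores = Vec::new();
    for entry in std::fs::read_dir(dir)? {
        let entry = entry?;
        if entry.file_name().to_string_lossy().starts_with("core.") {
            cores.push(entry.path());
        }
    }
    Ok(cores)
}
```

Each dump found this way is then passed to `gdb --batch -q -ex bt <postgres binary> <core file>`
(or to lldb on macOS), and the resulting backtrace is written to the log.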
--- Dockerfile.compute-node-v14 | 3 +- Dockerfile.compute-node-v15 | 3 +- compute_tools/src/compute.rs | 69 +++++++++++++++++++++++++++++++++++- 3 files changed, 72 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 1ffabafd51..e7fba49bb1 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -204,7 +204,8 @@ RUN apt update && \ libgeos-c1v5 \ libgdal28 \ libproj19 \ - libprotobuf-c1 && \ + libprotobuf-c1 \ + gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* USER postgres diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index 11cefcc2da..cd03525b97 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -204,7 +204,8 @@ RUN apt update && \ libgeos-c1v5 \ libgdal28 \ libproj19 \ - libprotobuf-c1 && \ + libprotobuf-c1 \ + gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* USER postgres diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index eceff0fc4e..7ebb98077a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -23,7 +23,7 @@ use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::info; +use log::{info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; @@ -328,6 +328,9 @@ impl ComputeNode { .wait() .expect("failed to start waiting on Postgres process"); + self.check_for_core_dumps() + .expect("failed to check for core dumps"); + Ok(ecode) } @@ -343,4 +346,68 @@ impl ComputeNode { self.prepare_pgdata()?; self.run() } + + // Look for core dumps and collect backtraces. + // + // EKS worker nodes have following core dump settings: + // /proc/sys/kernel/core_pattern -> core + // /proc/sys/kernel/core_uses_pid -> 1 + // ulimint -c -> unlimited + // which results in core dumps being written to postgres data directory as core.. + // + // Use that as a default location and pattern, except macos where core dumps are written + // to /cores/ directory by default. 
+ fn check_for_core_dumps(&self) -> Result<()> { + let core_dump_dir = match std::env::consts::OS { + "macos" => Path::new("/cores/"), + _ => Path::new(&self.pgdata), + }; + + // Collect core dump paths if any + info!("checking for core dumps in {}", core_dump_dir.display()); + let files = fs::read_dir(core_dump_dir)?; + let cores = files.filter_map(|entry| { + let entry = entry.ok()?; + let _ = entry.file_name().to_str()?.strip_prefix("core.")?; + Some(entry.path()) + }); + + // Print backtrace for each core dump + for core_path in cores { + warn!( + "core dump found: {}, collecting backtrace", + core_path.display() + ); + + // Try first with gdb + let backtrace = Command::new("gdb") + .args(["--batch", "-q", "-ex", "bt", &self.pgbin]) + .arg(&core_path) + .output(); + + // Try lldb if no gdb is found -- that is handy for local testing on macOS + let backtrace = match backtrace { + Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => { + warn!("cannot find gdb, trying lldb"); + Command::new("lldb") + .arg("-c") + .arg(&core_path) + .args(["--batch", "-o", "bt all", "-o", "quit"]) + .output() + } + _ => backtrace, + }?; + + warn!( + "core dump backtrace: {}", + String::from_utf8_lossy(&backtrace.stdout) + ); + warn!( + "debugger stderr: {}", + String::from_utf8_lossy(&backtrace.stderr) + ); + } + + Ok(()) + } } From 9b712159063df0571912436c7c807359a5d221d2 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 22 Dec 2022 13:49:52 +0100 Subject: [PATCH 138/167] Simplify some functions in compute_tools and fix typo errors in func name --- compute_tools/src/bin/compute_ctl.rs | 2 +- compute_tools/src/checker.rs | 2 +- compute_tools/src/compute.rs | 23 +++----------------- compute_tools/src/monitor.rs | 6 ++--- compute_tools/src/pg_helpers.rs | 27 ++++++++++------------- compute_tools/tests/pg_helpers_tests.rs | 29 +++++++++++++++++++++++++ 6 files changed, 48 insertions(+), 41 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 7786d7af9c..f3b787209d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -105,7 +105,7 @@ fn main() -> Result<()> { tenant, timeline, pageserver_connstr, - metrics: ComputeMetrics::new(), + metrics: ComputeMetrics::default(), state: RwLock::new(ComputeState::new()), }; let compute = Arc::new(compute_state); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index b6ba1692f9..ee1605c814 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -5,7 +5,7 @@ use tokio_postgres::NoTls; use crate::compute::ComputeNode; -pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { +pub fn create_writability_check_data(client: &mut Client) -> Result<()> { let query = " CREATE TABLE IF NOT EXISTS health_check ( id serial primary key, diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 7ebb98077a..c2c9ab2230 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -27,7 +27,7 @@ use log::{info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; -use crate::checker::create_writablity_check_data; +use crate::checker::create_writability_check_data; use crate::config; use crate::pg_helpers::*; use crate::spec::*; @@ -91,7 +91,7 @@ pub enum ComputeStatus { Failed, } -#[derive(Serialize)] +#[derive(Default, Serialize)] pub struct ComputeMetrics { pub sync_safekeepers_ms: AtomicU64, pub basebackup_ms: AtomicU64, @@ -99,23 +99,6 @@ pub struct 
ComputeMetrics { pub total_startup_ms: AtomicU64, } -impl ComputeMetrics { - pub fn new() -> Self { - Self { - sync_safekeepers_ms: AtomicU64::new(0), - basebackup_ms: AtomicU64::new(0), - config_ms: AtomicU64::new(0), - total_startup_ms: AtomicU64::new(0), - } - } -} - -impl Default for ComputeMetrics { - fn default() -> Self { - Self::new() - } -} - impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { self.state.write().unwrap().status = status; @@ -292,7 +275,7 @@ impl ComputeNode { handle_databases(&self.spec, &mut client)?; handle_role_deletions(self, &mut client)?; handle_grants(self, &mut client)?; - create_writablity_check_data(&mut client)?; + create_writability_check_data(&mut client)?; // 'Close' connection drop(client); diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 58cdf796bc..1588f5d62e 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -74,10 +74,8 @@ fn watch_compute_activity(compute: &ComputeNode) { } } - // Sort idle backend `state_change` timestamps. The last one corresponds - // to the last activity. - idle_backs.sort(); - if let Some(last) = idle_backs.last() { + // Get idle backend `state_change` with the max timestamp. + if let Some(last) = idle_backs.iter().max() { last_active = *last; } } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 289f223bda..ff422f1cf5 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -119,16 +119,9 @@ pub trait GenericOptionsSearch { impl GenericOptionsSearch for GenericOptions { /// Lookup option by name fn find(&self, name: &str) -> Option { - match &self { - Some(ops) => { - let op = ops.iter().find(|s| s.name == name); - match op { - Some(op) => op.value.clone(), - None => None, - } - } - None => None, - } + let ops = self.as_ref()?; + let op = ops.iter().find(|s| s.name == name)?; + op.value.clone() } } @@ -161,6 +154,14 @@ impl Role { } impl Database { + pub fn new(name: PgIdent, owner: PgIdent) -> Self { + Self { + name, + owner, + options: None, + } + } + /// Serialize a list of database parameters into a Postgres-acceptable /// string of arguments. /// NB: `TEMPLATE` is actually also an identifier, but so far we only need @@ -219,11 +220,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { &[], )? 
.iter() - .map(|row| Database { - name: row.get("datname"), - owner: row.get("owner"), - options: None, - }) + .map(|row| Database::new(row.get("datname"), row.get("owner"))) .collect(); Ok(postgres_dbs) diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 24cad4663a..431d9794bc 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -38,4 +38,33 @@ mod pg_helpers_tests { assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } + + #[test] + fn generic_options_search() { + let generic_options: GenericOptions = Some(vec![ + GenericOption { + name: "present_value".into(), + value: Some("value".into()), + vartype: "string".into(), + }, + GenericOption { + name: "missed_value".into(), + value: None, + vartype: "int".into(), + }, + ]); + assert_eq!(generic_options.find("present_value"), Some("value".into())); + assert_eq!(generic_options.find("missed_value"), None); + assert_eq!(generic_options.find("invalid_value"), None); + + let empty_generic_options: GenericOptions = Some(vec![]); + assert_eq!(empty_generic_options.find("present_value"), None); + assert_eq!(empty_generic_options.find("missed_value"), None); + assert_eq!(empty_generic_options.find("invalid_value"), None); + + let none_generic_options: GenericOptions = None; + assert_eq!(none_generic_options.find("present_value"), None); + assert_eq!(none_generic_options.find("missed_value"), None); + assert_eq!(none_generic_options.find("invalid_value"), None); + } } From 63eb87bde35aff98188039bd426879942111446c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Dec 2022 15:47:24 +0200 Subject: [PATCH 139/167] Set default metric_collection_interval to 10 min, which is more reasonable for real usage --- pageserver/src/config.rs | 2 +- test_runner/regress/test_metric_collection.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 9334f88a7e..66f8a9f4b8 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -56,7 +56,7 @@ pub mod defaults { pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); - pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "60 s"; + pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; /// /// Default built-in configuration file. diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index fa1bf0fbb2..b171be3ac7 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -69,7 +69,8 @@ def test_metric_collection( # to trigger remote storage operations in a controlled way neon_env_builder.pageserver_config_override = ( f""" - metric_collection_endpoint="{metric_collection_endpoint}" + metric_collection_interval="60s" + metric_collection_endpoint="{metric_collection_endpoint}" """ + "tenant_config={pitr_interval = '0 sec'}" ) From 8544c5932937788f0187a389fef37d743472aa1c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Dec 2022 18:31:52 +0200 Subject: [PATCH 140/167] Fix flaky test_metrics_collection.py Only check that all metrics are present on the first request, because pageserver doesn't send unchanged metrics. 
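For context: after the first successful report the pageserver only re-sends metrics whose
values changed, so only the very first request is guaranteed to carry the full metric set.
The idea is roughly the following (an illustrative sketch only, not the actual pageserver code):

```
use std::collections::HashMap;

// Keep the last reported value per metric and emit only the entries that
// changed since the previous collection round.
fn changed_metrics(
    previously_sent: &mut HashMap<String, u64>,
    current: &HashMap<String, u64>,
) -> Vec<(String, u64)> {
    let mut changed = Vec::new();
    for (name, value) in current {
        if previously_sent.get(name) != Some(value) {
            previously_sent.insert(name.clone(), *value);
            changed.push((name.clone(), *value));
        }
    }
    changed
}
```

Hence the test asserts completeness of the metric set only on the first request.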
--- test_runner/regress/test_metric_collection.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index b171be3ac7..a3b3609153 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -22,6 +22,7 @@ def httpserver_listen_address(port_distributor: PortDistributor): num_metrics_received = 0 remote_uploaded = 0 +first_request = True # @@ -46,7 +47,12 @@ def metrics_handler(request: Request) -> Response: for event in events: assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid" - assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received" + global first_request + # check that all checks were sent + # but only on the first request, because we don't send non-changed metrics + if first_request: + assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received" + first_request = False global num_metrics_received num_metrics_received += 1 From 5a496d82b0967211a4b2bc51dd3ccf71828dd683 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 22 Dec 2022 15:37:17 +0100 Subject: [PATCH 141/167] Do not deploy storage and proxies to old staging (#3180) We fully migrated out, this nodes will be soon decommissioned --- .github/workflows/build_and_test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ff433decf7..b98974c5a1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -767,8 +767,6 @@ jobs: - id: set-matrix run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}' - echo "include=[$STAGING]" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT From 72ab104733ea8ee9b4a59b2ea2ea2669a2437788 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 22 Dec 2022 19:21:53 +0400 Subject: [PATCH 142/167] Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) --- .github/ansible/production.hosts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index d22c845966..3122a43801 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -34,5 +34,5 @@ storage: console_region_id: aws-us-west-2 zenith-1-sk-2: console_region_id: aws-us-west-2 - zenith-1-sk-3: + zenith-1-sk-4: console_region_id: aws-us-west-2 From 7bc17b373e29c33efe7439a53b84757049415b8d Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 22 Dec 2022 16:28:36 +0100 Subject: [PATCH 143/167] Fix calculate-deploy-targets (#3189) Was broken in https://github.com/neondatabase/neon/pull/3180 --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml 
b/.github/workflows/build_and_test.yml index b98974c5a1..48ed800450 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -767,6 +767,7 @@ jobs: - id: set-matrix run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "include=[]" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT From c01f92c08141653644501167902bd586139ec9aa Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Thu, 22 Dec 2022 20:09:45 +0100 Subject: [PATCH 144/167] Fully remove old staging deploy (#3191) --- .github/ansible/staging.hosts.yaml | 35 ------------ .../staging.neon-storage-broker.yaml | 56 ------------------ .github/helm-values/staging.proxy-scram.yaml | 57 ------------------- .github/helm-values/staging.proxy.yaml | 57 ------------------- .github/workflows/build_and_test.yml | 16 +++--- 5 files changed, 7 insertions(+), 214 deletions(-) delete mode 100644 .github/ansible/staging.hosts.yaml delete mode 100644 .github/helm-values/staging.neon-storage-broker.yaml delete mode 100644 .github/helm-values/staging.proxy-scram.yaml delete mode 100644 .github/helm-values/staging.proxy.yaml diff --git a/.github/ansible/staging.hosts.yaml b/.github/ansible/staging.hosts.yaml deleted file mode 100644 index 79acfd1d2a..0000000000 --- a/.github/ansible/staging.hosts.yaml +++ /dev/null @@ -1,35 +0,0 @@ -storage: - vars: - bucket_name: zenith-staging-storage-us-east-1 - bucket_region: us-east-1 - console_mgmt_base_url: http://console-staging.local - broker_endpoint: http://storage-broker.staging.local:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: us-stage/wal - hostname_suffix: ".local" - remote_user: admin - sentry_environment: development - - children: - pageservers: - hosts: - zenith-us-stage-ps-2: - console_region_id: aws-us-east-1 - zenith-us-stage-ps-3: - console_region_id: aws-us-east-1 - zenith-us-stage-ps-4: - console_region_id: aws-us-east-1 - - safekeepers: - hosts: - zenith-us-stage-sk-4: - console_region_id: aws-us-east-1 - zenith-us-stage-sk-5: - console_region_id: aws-us-east-1 - zenith-us-stage-sk-6: - console_region_id: aws-us-east-1 diff --git a/.github/helm-values/staging.neon-storage-broker.yaml b/.github/helm-values/staging.neon-storage-broker.yaml deleted file mode 100644 index 6b21c286a1..0000000000 --- a/.github/helm-values/staging.neon-storage-broker.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Helm chart values for neon-storage-broker -podLabels: - neon_env: staging - neon_service: storage-broker - -# Use L4 LB -service: - # service.annotations -- Annotations to add to the service - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet - # assign service to this name at external-dns - external-dns.alpha.kubernetes.io/hostname: storage-broker.staging.local - # service.type -- Service type - type: 
LoadBalancer - # service.port -- broker listen port - port: 50051 - -ingress: - enabled: false - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-storage-broker.fullname\" . }}" - labels: - helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} - app.kubernetes.io/name: neon-storage-broker - app.kubernetes.io/instance: neon-storage-broker - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-storage-broker" - endpoints: - - port: broker - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" - -settings: - sentryEnvironment: "development" diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml deleted file mode 100644 index 66f9921c9a..0000000000 --- a/.github/helm-values/staging.proxy-scram.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Helm chart values for zenith-proxy. -# This is a YAML-formatted file. - -image: - repository: neondatabase/neon - -settings: - authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" - domain: "*.cloud.stage.neon.tech" - sentryEnvironment: "development" - -# -- Additional labels for zenith-proxy pods -podLabels: - zenith_service: proxy-scram - zenith_env: staging - zenith_region: us-east-1 - zenith_region_slug: virginia - -exposedService: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-proxy.fullname\" . }}" - labels: - helm.sh/chart: neon-proxy-{{ .Chart.Version }} - app.kubernetes.io/name: neon-proxy - app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-proxy" - endpoints: - - port: http - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml deleted file mode 100644 index a22082e625..0000000000 --- a/.github/helm-values/staging.proxy.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Helm chart values for zenith-proxy. -# This is a YAML-formatted file. 
- -image: - repository: neondatabase/neon - -settings: - authBackend: "link" - authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" - uri: "https://console.stage.neon.tech/psql_session/" - sentryEnvironment: "development" - -# -- Additional labels for zenith-proxy pods -podLabels: - zenith_service: proxy - zenith_env: staging - zenith_region: us-east-1 - zenith_region_slug: virginia - -exposedService: - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-proxy.fullname\" . }}" - labels: - helm.sh/chart: neon-proxy-{{ .Chart.Version }} - app.kubernetes.io/name: neon-proxy - app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-proxy" - endpoints: - - port: http - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 48ed800450..17c698482c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -759,20 +759,18 @@ jobs: calculate-deploy-targets: runs-on: [ self-hosted, dev, x64 ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' outputs: matrix-include: ${{ steps.set-matrix.outputs.include }} steps: - id: set-matrix run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "include=[]" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + if [[ "$GITHUB_REF_NAME" == "release" ]]; then PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'" exit 1 fi @@ -783,7 +781,7 @@ jobs: # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' defaults: run: @@ -827,7 +825,7 @@ jobs: container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + needs: [ push-docker-hub, tag, regress-tests ] if: | (github.ref_name == 'main') && github.event_name != 'workflow_dispatch' @@ -939,7 +937,7 @@ jobs: # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' defaults: run: @@ -982,7 +980,7 @@ jobs: # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' defaults: run: From 0bafb2a6c703f152bdd3a6ff194720d27eff0b4b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 23 Dec 2022 15:39:59 +0200 Subject: [PATCH 145/167] Do more on-demand downloads where needed (#3194) The PR aims to fix two missing redownloads in a flacky test_remote_storage_upload_queue_retries[local_fs] ([example](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3190/release/3759194738/index.html#categories/80f1dcdd7c08252126be7e9f44fe84e6/8a70800f7ab13620/)) 1. missing redownload during walreceiver work ``` 2022-12-22T16:09:51.509891Z ERROR wal_connection_manager{tenant=fb62b97553e40f949de8bdeab7f93563 timeline=4f153bf6a58fd63832f6ee175638d049}: wal receiver task finished with an error: walreceiver connection handling failure Caused by: Layer needs downloading Stack backtrace: 0: pageserver::tenant::timeline::PageReconstructResult::no_ondemand_download at /__w/neon/neon/pageserver/src/tenant/timeline.rs:467:59 1: pageserver::walingest::WalIngest::new at /__w/neon/neon/pageserver/src/walingest.rs:61:32 2: pageserver::walreceiver::walreceiver_connection::handle_walreceiver_connection::{{closure}} at /__w/neon/neon/pageserver/src/walreceiver/walreceiver_connection.rs:178:25 .... ``` That looks sad, but inevitable during the current approach: seems that we need to wait for old layers to arrive in order to accept new data. For that, `WalIngest::new` now started to return the `PageReconstructResult`. Sync methods from `import_datadir.rs` use `WalIngest::new` too, but both of them import WAL during timeline creation, so no layers to download are needed there, ergo the `PageReconstructResult` is converted to `anyhow::Result` with `no_ondemand_download`. 2. 
missing redownload during compaction work ``` 2022-12-22T16:09:51.090296Z ERROR compaction_loop{tenant_id=fb62b97553e40f949de8bdeab7f93563}:compact_timeline{timeline=4f153bf6a58fd63832f6ee175638d049}: could not compact, repartitioning keyspace failed: Layer needs downloading Stack backtrace: 0: pageserver::tenant::timeline::PageReconstructResult::no_ondemand_download at /__w/neon/neon/pageserver/src/tenant/timeline.rs:467:59 1: pageserver::pgdatadir_mapping::::collect_keyspace::{{closure}} at /__w/neon/neon/pageserver/src/pgdatadir_mapping.rs:506:41 as core::future::future::Future>::poll at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19 pageserver::tenant::timeline::Timeline::repartition::{{closure}} at /__w/neon/neon/pageserver/src/tenant/timeline.rs:2161:50 as core::future::future::Future>::poll at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19 2: pageserver::tenant::timeline::Timeline::compact::{{closure}} at /__w/neon/neon/pageserver/src/tenant/timeline.rs:700:14 as core::future::future::Future>::poll at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19 3: as core::future::future::Future>::poll at /github/home/.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-0.1.37/src/instrument.rs:272:9 4: pageserver::tenant::Tenant::compaction_iteration::{{closure}} at /__w/neon/neon/pageserver/src/tenant.rs:1232:85 as core::future::future::Future>::poll at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19 pageserver::tenant_tasks::compaction_loop::{{closure}}::{{closure}} at /__w/neon/neon/pageserver/src/tenant_tasks.rs:76:62 as core::future::future::Future>::poll at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19 pageserver::tenant_tasks::compaction_loop::{{closure}} at /__w/neon/neon/pageserver/src/tenant_tasks.rs:91:6 ``` --- pageserver/src/import_datadir.rs | 11 ++++++++--- pageserver/src/pgdatadir_mapping.rs | 13 ++++++------- pageserver/src/walingest.rs | 14 ++++++++------ .../src/walreceiver/walreceiver_connection.rs | 3 ++- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 76ca183c9a..588b92c13f 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -237,14 +237,19 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
-fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { +fn import_wal( + walpath: &Path, + tline: &Timeline, + startpoint: Lsn, + endpoint: Lsn, +) -> anyhow::Result<()> { let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(tline, startpoint)?; + let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -362,7 +367,7 @@ pub fn import_wal_from_tar( let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; - let mut walingest = WalIngest::new(tline, start_lsn)?; + let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 793dddef01..82b1576145 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -8,7 +8,7 @@ //! use super::tenant::PageReconstructResult; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::tenant::Timeline; +use crate::tenant::{with_ondemand_download, Timeline}; use crate::walrecord::NeonWalRecord; use crate::{repository::*, try_no_ondemand_download}; use anyhow::Context; @@ -503,12 +503,11 @@ impl Timeline { result.add_key(relmap_file_key(spcnode, dbnode)); result.add_key(rel_dir_to_key(spcnode, dbnode)); - let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn) - .no_ondemand_download()? - .iter() - .cloned() - .collect(); + let mut rels: Vec = + with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn)) + .await? + .into_iter() + .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 26a77c02d4..031b80a6e0 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -33,10 +33,10 @@ use tracing::*; use crate::pgdatadir_mapping::*; use crate::tenant::PageReconstructResult; use crate::tenant::Timeline; -use crate::try_no_ondemand_download; use crate::try_page_reconstruct_result as try_prr; use crate::walrecord::*; use crate::ZERO_PAGE; +use crate::{try_no_ondemand_download, try_page_reconstruct_result}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -55,14 +55,16 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { + pub fn new(timeline: &Timeline, startpoint: Lsn) -> PageReconstructResult { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
- let checkpoint_bytes = timeline.get_checkpoint(startpoint).no_ondemand_download()?; - let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; + let checkpoint_bytes = try_no_ondemand_download!(timeline.get_checkpoint(startpoint)); + let checkpoint = try_page_reconstruct_result!( + CheckPoint::decode(&checkpoint_bytes).context("Failed to decode checkpoint bytes") + ); trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); - Ok(WalIngest { + PageReconstructResult::Success(WalIngest { timeline, checkpoint, checkpoint_modified: false, @@ -1122,7 +1124,7 @@ mod tests { m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10))?; + let walingest = WalIngest::new(tline, Lsn(0x10)).no_ondemand_download()?; Ok(walingest) } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index cc318cccc8..a98126e683 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -175,7 +175,8 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; + let mut walingest = + with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?; while let Some(replication_message) = { select! { From b77c33ee0644f00ea3fc0af252f11f7454f5c3fb Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 23 Dec 2022 15:40:37 +0200 Subject: [PATCH 146/167] Move tenant-related modules below `tenant` module (#3190) No real code changes besides moving code around and adjusting the imports. 
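In practice the change is only visible through import paths: the individual layer modules are
no longer reachable as siblings of `storage_layer`, and callers use its re-exports instead
(likewise `storage_sync2` becomes `tenant::storage_sync`). For example, mirroring the benchmark
diff below:

```
// Before the move, layer types were imported from separate modules:
//   use pageserver::tenant::filename::{DeltaFileName, ImageFileName};
//   use pageserver::tenant::storage_layer::ValueReconstructState;
// After the move, the same types come from the storage_layer re-exports:
use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
```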
--- pageserver/benches/bench_layer_map.rs | 3 +- pageserver/src/bin/pageserver.rs | 35 +++++++++++++++- pageserver/src/lib.rs | 2 - pageserver/src/tenant.rs | 25 ++++------- pageserver/src/tenant/layer_map.rs | 3 +- pageserver/src/tenant/storage_layer.rs | 15 +++++-- .../tenant/{ => storage_layer}/delta_layer.rs | 4 +- .../tenant/{ => storage_layer}/filename.rs | 0 .../tenant/{ => storage_layer}/image_layer.rs | 5 +-- .../{ => storage_layer}/inmemory_layer.rs | 3 +- .../{ => storage_layer}/remote_layer.rs | 10 ++--- .../storage_sync.rs} | 42 +++---------------- .../storage_sync}/delete.rs | 0 .../storage_sync}/download.rs | 5 +-- .../storage_sync}/index.rs | 2 +- .../storage_sync}/upload.rs | 6 +-- pageserver/src/tenant/timeline.rs | 25 ++++++----- test_runner/regress/test_tenant_detach.py | 2 +- 18 files changed, 87 insertions(+), 100 deletions(-) rename pageserver/src/tenant/{ => storage_layer}/delta_layer.rs (99%) rename pageserver/src/tenant/{ => storage_layer}/filename.rs (100%) rename pageserver/src/tenant/{ => storage_layer}/image_layer.rs (99%) rename pageserver/src/tenant/{ => storage_layer}/inmemory_layer.rs (99%) rename pageserver/src/tenant/{ => storage_layer}/remote_layer.rs (94%) rename pageserver/src/{storage_sync2.rs => tenant/storage_sync.rs} (97%) rename pageserver/src/{storage_sync2 => tenant/storage_sync}/delete.rs (100%) rename pageserver/src/{storage_sync2 => tenant/storage_sync}/download.rs (98%) rename pageserver/src/{storage_sync2 => tenant/storage_sync}/index.rs (99%) rename pageserver/src/{storage_sync2 => tenant/storage_sync}/upload.rs (96%) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index a0c38e1e3a..6a01fdfc6f 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,8 +1,7 @@ use anyhow::Result; use pageserver::repository::Key; -use pageserver::tenant::filename::{DeltaFileName, ImageFileName}; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::ValueReconstructState; +use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState}; use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e72a861be0..d12063f5aa 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -7,12 +7,13 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; +use remote_storage::GenericRemoteStorage; use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, - http, page_cache, page_service, profiling, storage_sync2, task_mgr, + http, page_cache, page_service, profiling, task_mgr, task_mgr::TaskKind, task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, @@ -280,7 +281,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }; // Set up remote storage client - let remote_storage = storage_sync2::create_remote_storage_client(conf)?; + let remote_storage = create_remote_storage_client(conf)?; // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(tenant_mgr::init_tenant_mgr(conf, remote_storage.clone()))?; @@ -369,6 +370,36 @@ fn start_pageserver(conf: &'static 
PageServerConf) -> anyhow::Result<()> { }) } +fn create_remote_storage_client( + conf: &'static PageServerConf, +) -> anyhow::Result> { + let config = if let Some(config) = &conf.remote_storage_config { + config + } else { + // No remote storage configured. + return Ok(None); + }; + + // Create the client + let mut remote_storage = GenericRemoteStorage::from_config(config)?; + + // If `test_remote_failures` is non-zero, wrap the client with a + // wrapper that simulates failures. + if conf.test_remote_failures > 0 { + if !cfg!(feature = "testing") { + anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + } + info!( + "Simulating remote failures for first {} attempts of each op", + conf.test_remote_failures + ); + remote_storage = + GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); + } + + Ok(Some(remote_storage)) +} + fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index e01eb12b7b..ae815fe421 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,8 +11,6 @@ pub mod page_service; pub mod pgdatadir_mapping; pub mod profiling; pub mod repository; -pub mod storage_sync2; -pub use storage_sync2 as storage_sync; pub mod task_mgr; pub mod tenant; pub mod tenant_config; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4129c205ad..308130c799 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -45,18 +45,19 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; +use self::storage_sync::create_remote_timeline_client; +use self::storage_sync::index::IndexPart; +use self::storage_sync::RemoteTimelineClient; use crate::config::PageServerConf; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::repository::GcResult; -use crate::storage_sync::create_remote_timeline_client; -use crate::storage_sync::index::IndexPart; -use crate::storage_sync::list_remote_timelines; -use crate::storage_sync::RemoteTimelineClient; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::metadata::load_metadata; +use crate::tenant::storage_layer::DeltaLayer; +use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; @@ -74,18 +75,14 @@ use utils::{ mod blob_io; pub mod block_io; -mod delta_layer; mod disk_btree; pub(crate) mod ephemeral_file; -pub mod filename; -mod image_layer; -mod inmemory_layer; pub mod layer_map; -mod remote_layer; pub mod metadata; mod par_fsync; pub mod storage_layer; +mod storage_sync; mod timeline; @@ -647,7 +644,7 @@ impl Tenant { .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?; let remote_timelines = - list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?; + storage_sync::list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?; info!("found {} timelines", remote_timelines.len()); @@ -2541,12 +2538,8 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<() file.read_exact_at(&mut header_buf, 0)?; match u16::from_be_bytes(header_buf) { - crate::IMAGE_FILE_MAGIC => { - image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)? 
- } - crate::DELTA_FILE_MAGIC => { - delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)? - } + crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?, + crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?, magic => bail!("unrecognized magic identifier: {:?}", magic), } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 0202ccfa6a..f5182926e4 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -12,7 +12,6 @@ use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; -use crate::tenant::inmemory_layer::InMemoryLayer; use crate::tenant::storage_layer::{range_eq, range_overlaps}; use amplify_num::i256; use anyhow::Result; @@ -27,7 +26,7 @@ use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; -use super::storage_layer::Layer; +use super::storage_layer::{InMemoryLayer, Layer}; /// /// LayerMap tracks what layers exist on a timeline. diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 8bfac5df8e..d87a248bdf 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,6 +1,10 @@ -//! //! Common traits and structs for layers -//! + +mod delta_layer; +mod filename; +mod image_layer; +mod inmemory_layer; +mod remote_layer; use crate::repository::{Key, Value}; use crate::walrecord::NeonWalRecord; @@ -15,8 +19,11 @@ use utils::{ lsn::Lsn, }; -use super::filename::LayerFileName; -use super::remote_layer::RemoteLayer; +pub use delta_layer::{DeltaLayer, DeltaLayerWriter}; +pub use filename::{DeltaFileName, ImageFileName, LayerFileName, PathOrConf}; +pub use image_layer::{ImageLayer, ImageLayerWriter}; +pub use inmemory_layer::InMemoryLayer; +pub use remote_layer::RemoteLayer; pub fn range_overlaps(a: &Range, b: &Range) -> bool where diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs similarity index 99% rename from pageserver/src/tenant/delta_layer.rs rename to pageserver/src/tenant/storage_layer/delta_layer.rs index 5b724b6263..302ba2dc78 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -29,7 +29,6 @@ use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::filename::{DeltaFileName, PathOrConf}; use crate::tenant::storage_layer::{ PersistentLayer, ValueReconstructResult, ValueReconstructState, }; @@ -54,8 +53,7 @@ use utils::{ lsn::Lsn, }; -use super::filename::LayerFileName; -use super::storage_layer::{Layer, LayerIter, LayerKeyIter}; +use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOrConf}; /// /// Header stored in the beginning of the file diff --git a/pageserver/src/tenant/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs similarity index 100% rename from pageserver/src/tenant/filename.rs rename to pageserver/src/tenant/storage_layer/filename.rs diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs similarity index 99% rename from pageserver/src/tenant/image_layer.rs rename to pageserver/src/tenant/storage_layer/image_layer.rs index 4b43328f35..9a26fce73b 100644 --- a/pageserver/src/tenant/image_layer.rs +++ 
b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,7 +25,6 @@ use crate::repository::{Key, KEY_SIZE}; use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::filename::{ImageFileName, PathOrConf}; use crate::tenant::storage_layer::{ PersistentLayer, ValueReconstructResult, ValueReconstructState, }; @@ -51,8 +50,8 @@ use utils::{ lsn::Lsn, }; -use super::filename::LayerFileName; -use super::storage_layer::{Layer, LayerIter}; +use super::filename::{ImageFileName, LayerFileName, PathOrConf}; +use super::{Layer, LayerIter}; /// /// Header stored in the beginning of the file diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs similarity index 99% rename from pageserver/src/tenant/inmemory_layer.rs rename to pageserver/src/tenant/storage_layer/inmemory_layer.rs index 35b0e98591..93356a9d8c 100644 --- a/pageserver/src/tenant/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -8,7 +8,6 @@ use crate::config::PageServerConf; use crate::repository::{Key, Value}; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockReader; -use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; use crate::walrecord; @@ -28,7 +27,7 @@ use std::fmt::Write as _; use std::ops::Range; use std::sync::RwLock; -use super::storage_layer::Layer; +use super::{DeltaLayer, DeltaLayerWriter, Layer}; thread_local! { /// A buffer for serializing object during [`InMemoryLayer::put_value`]. diff --git a/pageserver/src/tenant/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs similarity index 94% rename from pageserver/src/tenant/remote_layer.rs rename to pageserver/src/tenant/storage_layer/remote_layer.rs index affe8ca0a8..c2c11d7bff 100644 --- a/pageserver/src/tenant/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -3,11 +3,8 @@ //! 
use crate::config::PageServerConf; use crate::repository::Key; -use crate::storage_sync::index::LayerFileMetadata; -use crate::tenant::delta_layer::DeltaLayer; -use crate::tenant::filename::{DeltaFileName, ImageFileName}; -use crate::tenant::image_layer::ImageLayer; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_sync::index::LayerFileMetadata; use anyhow::{bail, Result}; use std::ops::Range; use std::path::PathBuf; @@ -18,8 +15,9 @@ use utils::{ lsn::Lsn, }; -use super::filename::LayerFileName; -use super::storage_layer::{LayerIter, LayerKeyIter, PersistentLayer}; +use super::filename::{DeltaFileName, ImageFileName, LayerFileName}; +use super::image_layer::ImageLayer; +use super::{DeltaLayer, LayerIter, LayerKeyIter, PersistentLayer}; #[derive(Debug)] pub struct RemoteLayer { diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/tenant/storage_sync.rs similarity index 97% rename from pageserver/src/storage_sync2.rs rename to pageserver/src/tenant/storage_sync.rs index 6883c11473..ef57f91a02 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/tenant/storage_sync.rs @@ -221,15 +221,12 @@ use tracing::{info_span, Instrument}; use utils::lsn::Lsn; -use self::index::IndexPart; - use crate::metrics::RemoteOpFileKind; use crate::metrics::RemoteOpKind; use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics}; -use crate::tenant::filename::LayerFileName; +use crate::tenant::storage_sync::index::LayerFileMetadata; use crate::{ config::PageServerConf, - storage_sync::index::LayerFileMetadata, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, @@ -239,6 +236,10 @@ use crate::{ use utils::id::{TenantId, TimelineId}; +use self::index::IndexPart; + +use super::storage_layer::LayerFileName; + // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN @@ -1178,39 +1179,6 @@ pub fn create_remote_timeline_client( }) } -/// -/// Create GenericRemoteStorage client from the pageserver config -/// -pub fn create_remote_storage_client( - conf: &'static PageServerConf, -) -> anyhow::Result> { - let config = if let Some(config) = &conf.remote_storage_config { - config - } else { - // No remote storage configured. - return Ok(None); - }; - - // Create the client - let mut remote_storage = GenericRemoteStorage::from_config(config)?; - - // If `test_remote_failures` is non-zero, wrap the client with a - // wrapper that simulates failures. 
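The comment above refers to GenericRemoteStorage::unreliable_wrapper, which makes the first `test_remote_failures` attempts of each operation fail so that the retry paths get exercised. Conceptually the wrapper is just a counting decorator; a simplified standalone sketch (assumed names and a single global counter instead of per-operation bookkeeping, not the real remote_storage API):

use std::sync::atomic::{AtomicU32, Ordering};

/// Fails the first `fail_first` calls, then delegates to the wrapped client.
struct Unreliable<S> {
    inner: S,
    fail_first: u32,
    attempts: AtomicU32,
}

impl<S> Unreliable<S> {
    fn call<T>(&self, op: impl FnOnce(&S) -> Result<T, String>) -> Result<T, String> {
        let attempt = self.attempts.fetch_add(1, Ordering::Relaxed) + 1;
        if attempt <= self.fail_first {
            // Simulated failure; the caller is expected to retry with backoff.
            return Err(format!("simulated failure (attempt {attempt})"));
        }
        op(&self.inner)
    }
}

fn main() {
    let storage = Unreliable { inner: (), fail_first: 2, attempts: AtomicU32::new(0) };
    for _ in 0..3 {
        // The first two attempts fail, the third one goes through.
        println!("{:?}", storage.call(|_| Ok::<_, String>("uploaded")));
    }
}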
- if conf.test_remote_failures > 0 { - if !cfg!(feature = "testing") { - anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); - } - info!( - "Simulating remote failures for first {} attempts of each op", - conf.test_remote_failures - ); - remote_storage = - GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); - } - - Ok(Some(remote_storage)) -} - #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/storage_sync2/delete.rs b/pageserver/src/tenant/storage_sync/delete.rs similarity index 100% rename from pageserver/src/storage_sync2/delete.rs rename to pageserver/src/tenant/storage_sync/delete.rs diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/tenant/storage_sync/download.rs similarity index 98% rename from pageserver/src/storage_sync2/download.rs rename to pageserver/src/tenant/storage_sync/download.rs index 4256767020..422728d1f3 100644 --- a/pageserver/src/storage_sync2/download.rs +++ b/pageserver/src/tenant/storage_sync/download.rs @@ -14,14 +14,13 @@ use tokio::io::AsyncWriteExt; use tracing::{debug, error, info, info_span, warn, Instrument}; use crate::config::PageServerConf; -use crate::storage_sync::index::LayerFileMetadata; -use crate::tenant::filename::LayerFileName; +use crate::tenant::storage_layer::LayerFileName; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; -use super::index::{IndexPart, IndexPartUnclean}; +use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata}; use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD}; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/tenant/storage_sync/index.rs similarity index 99% rename from pageserver/src/storage_sync2/index.rs rename to pageserver/src/tenant/storage_sync/index.rs index bb58a34969..017be29726 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/tenant/storage_sync/index.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use tracing::warn; -use crate::tenant::{filename::LayerFileName, metadata::TimelineMetadata}; +use crate::tenant::{metadata::TimelineMetadata, storage_layer::LayerFileName}; use utils::lsn::Lsn; diff --git a/pageserver/src/storage_sync2/upload.rs b/pageserver/src/tenant/storage_sync/upload.rs similarity index 96% rename from pageserver/src/storage_sync2/upload.rs rename to pageserver/src/tenant/storage_sync/upload.rs index 57a524a22d..08cea6268b 100644 --- a/pageserver/src/storage_sync2/upload.rs +++ b/pageserver/src/tenant/storage_sync/upload.rs @@ -5,12 +5,12 @@ use fail::fail_point; use std::path::Path; use tokio::fs; -use super::index::IndexPart; -use crate::config::PageServerConf; -use crate::storage_sync::LayerFileMetadata; +use crate::{config::PageServerConf, tenant::storage_sync::index::IndexPart}; use remote_storage::GenericRemoteStorage; use utils::id::{TenantId, TimelineId}; +use super::index::LayerFileMetadata; + /// Serializes and uploads the given index part data to the remote storage. 
pub(super) async fn upload_index_part<'a>( conf: &'static PageServerConf, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 25a9e1ec51..55ede57e53 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,15 +23,13 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::storage_sync::index::IndexPart; -use crate::storage_sync::RemoteTimelineClient; -use crate::tenant::remote_layer::RemoteLayer; +use crate::tenant::storage_layer::{ + DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, + RemoteLayer, +}; +use crate::tenant::storage_sync::{self, index::LayerFileMetadata}; use crate::tenant::{ - delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, - filename::{DeltaFileName, ImageFileName}, - image_layer::{ImageLayer, ImageLayerWriter}, - inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, metadata::{save_metadata, TimelineMetadata}, par_fsync, @@ -56,6 +54,7 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; +use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; @@ -64,10 +63,10 @@ use crate::walredo::WalRedoManager; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; -use crate::{page_cache, storage_sync::index::LayerFileMetadata}; -use super::filename::LayerFileName; -use super::storage_layer::Layer; +use super::storage_layer::{DeltaLayer, ImageLayer, Layer}; +use super::storage_sync::index::IndexPart; +use super::storage_sync::RemoteTimelineClient; #[derive(Debug, PartialEq, Eq, Clone, Copy)] enum FlushLoopState { @@ -97,7 +96,7 @@ pub struct Timeline { walredo_mgr: Arc, /// Remote storage client. - /// See [`storage_sync2`] module comment for details. + /// See [`storage_sync`] module comment for details. pub remote_client: Option>, // What page versions do we hold in the repository? If we get a @@ -1123,7 +1122,7 @@ impl Timeline { num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these - } else if crate::storage_sync::is_temp_download_file(&direntry_path) { + } else if storage_sync::is_temp_download_file(&direntry_path) { info!( "skipping temp download file, reconcile_with_remote will resume / clean up: {}", fname @@ -1293,7 +1292,7 @@ impl Timeline { /// 3. Schedule upload of local-only layer files (which will then also update the remote /// IndexPart to include the new layer files). /// - /// Refer to the `storage_sync2` module comment for more context. + /// Refer to the `storage_sync` module comment for more context. /// /// # TODO /// May be a bit cleaner to do things based on populated remote client, diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8bf0fb7548..6963a57542 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -470,7 +470,7 @@ def test_ignore_while_attaching( pageserver_http.tenant_attach(tenant_id) # Run ignore on the task, thereby cancelling the attach. # XXX This should take priority over attach, i.e., it should cancel the attach task. 
- # But neither the failpoint, nor the proper storage_sync2 download functions, + # But neither the failpoint, nor the proper storage_sync download functions, # are sensitive to task_mgr::shutdown. # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 . # So, for now, effectively, this ignore here will block until attach task completes. From 1468c65ffb70dcc072cb341bf6aa6f800bce6840 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 22 Dec 2022 16:32:29 +0200 Subject: [PATCH 147/167] Enable billing metric_collection_endpoint on staging --- .github/ansible/staging.eu-west-1.hosts.yaml | 2 ++ .github/ansible/staging.us-east-2.hosts.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index cfcc3a9ae8..fce450ed39 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -6,6 +6,8 @@ storage: broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local + metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 78a4582e57..11c7992444 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -6,6 +6,8 @@ storage: broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local + metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" From 1137b58b4d476370c36af93f5da46f9ae2562303 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 26 Dec 2022 18:21:41 +0200 Subject: [PATCH 148/167] Fix LayerMap::search to not return delta layer preceeding image layer (#3197) While @bojanserafimov is still working on best replacement of R-Tree in layer_map.rs there is obvious pitfall in the current `search` method implementation: is returns delta layer even if there is image layer if greater LSN. I think that it should be fixed. --- pageserver/src/tenant/layer_map.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index f5182926e4..4ff2d4b0d8 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -326,14 +326,16 @@ where latest_delta.replace(Arc::clone(l)); break; } - // this layer's end LSN is smaller than the requested point. If there's - // nothing newer, this is what we need to return. Remember this. - if let Some(old_candidate) = &latest_delta { - if l.get_lsn_range().end > old_candidate.get_lsn_range().end { + if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) { + // this layer's end LSN is smaller than the requested point. If there's + // nothing newer, this is what we need to return. Remember this. 
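+            // Example with made-up LSNs: searching at LSN 150 with
+            // latest_img_lsn = Some(Lsn(100)), a delta layer ending at LSN 90 is
+            // already covered by that image and is skipped by the check above,
+            // while a delta layer ending at LSN 120 remains a valid candidate.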
+ if let Some(old_candidate) = &latest_delta { + if l.get_lsn_range().end > old_candidate.get_lsn_range().end { + latest_delta.replace(Arc::clone(l)); + } + } else { latest_delta.replace(Arc::clone(l)); } - } else { - latest_delta.replace(Arc::clone(l)); } } if let Some(l) = latest_delta { From 5826e19b56fc86338204c35bd0916143bf425a47 Mon Sep 17 00:00:00 2001 From: Anna Stepanyan Date: Tue, 27 Dec 2022 10:25:19 +0100 Subject: [PATCH 149/167] update the grafana links in the PR release template (#3156) --- .github/PULL_REQUEST_TEMPLATE/release-pr.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md index 8fcc3bd4af..a848077e6a 100644 --- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md +++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md @@ -14,7 +14,7 @@ - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) - [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) -- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1) -- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time) +- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1) +- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time) From 140c0edac8a8322efaf88fc15d11977b077869a5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 27 Dec 2022 14:42:51 +0200 Subject: [PATCH 150/167] Yet another port of local file system cache (#2622) --- pgxn/neon/Makefile | 3 +- pgxn/neon/file_cache.c | 597 +++++++++++++++++++++++++++++++++++ pgxn/neon/libpagestore.c | 1 + pgxn/neon/neon--1.0.sql | 10 + pgxn/neon/pagestore_client.h | 7 + pgxn/neon/pagestore_smgr.c | 14 + 6 files changed, 631 insertions(+), 1 deletion(-) create mode 100644 pgxn/neon/file_cache.c diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 7f4e30a12e..ec377dbb1e 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -4,11 +4,12 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ + file_cache.o \ libpagestore.o \ libpqwalproposer.o \ + neon.o \ pagestore_smgr.o \ relsize_cache.o \ - neon.o \ walproposer.o \ walproposer_utils.o diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c new file mode 100644 index 0000000000..96c2461e2d --- /dev/null +++ b/pgxn/neon/file_cache.c @@ -0,0 +1,597 @@ +/* + * + * file_cache.c + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * pgxn/neon/file_cache.c + * + *------------------------------------------------------------------------- + */ + +#include +#include +#include + +#include "postgres.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "pagestore_client.h" +#include "access/parallel.h" +#include "postmaster/bgworker.h" +#include "storage/relfilenode.h" +#include "storage/buf_internals.h" +#include "storage/latch.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/dynahash.h" 
+#include "utils/guc.h" +#include "storage/fd.h" +#include "storage/pg_shmem.h" +#include "storage/buf_internals.h" + +/* + * Local file cache is used to temporary store relations pages in local file system. + * All blocks of all relations are stored inside one file and addressed using shared hash map. + * Currently LRU eviction policy based on L2 list is used as replacement algorithm. + * As far as manipulation of L2-list requires global critical section, we are not using partitioned hash. + * Also we are using exclusive lock even for read operation because LRU requires relinking element in L2 list. + * If this lock become a bottleneck, we can consider other eviction strategies, for example clock algorithm. + * + * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about + * its consistency. + */ + +/* Local file storage allocation chunk. + * Should be power of two and not less than 32. Using larger than page chunks can + * 1. Reduce hash-map memory footprint: 8TB database contains billion pages + * and size of hash entry is 40 bytes, so we need 40Gb just for hash map. + * 1Mb chunks can reduce hash map size to 320Mb. + * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed + */ +#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ +#define MB ((uint64)1024*1024) + +#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) + +typedef struct FileCacheEntry +{ + BufferTag key; + uint32 offset; + uint32 access_count; + uint32 bitmap[BLOCKS_PER_CHUNK/32]; + dlist_node lru_node; /* LRU list node */ +} FileCacheEntry; + +typedef struct FileCacheControl +{ + uint32 size; /* size of cache file in chunks */ + dlist_head lru; /* double linked list for LRU replacement algorithm */ +} FileCacheControl; + +static HTAB* lfc_hash; +static int lfc_desc; +static LWLockId lfc_lock; +static int lfc_max_size; +static int lfc_size_limit; +static char* lfc_path; +static FileCacheControl* lfc_ctl; +static shmem_startup_hook_type prev_shmem_startup_hook; +#if PG_VERSION_NUM>=150000 +static shmem_request_hook_type prev_shmem_request_hook; +#endif + +static void +lfc_shmem_startup(void) +{ + bool found; + static HASHCTL info; + + if (prev_shmem_startup_hook) + { + prev_shmem_startup_hook(); + } + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found); + if (!found) + { + uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size); + lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock"); + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(FileCacheEntry); + lfc_hash = ShmemInitHash("lfc_hash", + /* lfc_size+1 because we add new element to hash table before eviction of victim */ + lfc_size+1, lfc_size+1, + &info, + HASH_ELEM | HASH_BLOBS); + lfc_ctl->size = 0; + dlist_init(&lfc_ctl->lru); + + /* Remove file cache on restart */ + (void)unlink(lfc_path); + } + LWLockRelease(AddinShmemInitLock); +} + +static void +lfc_shmem_request(void) +{ +#if PG_VERSION_NUM>=150000 + if (prev_shmem_request_hook) + prev_shmem_request_hook(); +#endif + + RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry))); + RequestNamedLWLockTranche("lfc_lock", 1); +} + +bool +lfc_check_limit_hook(int *newval, void **extra, GucSource source) +{ + if (*newval > lfc_max_size) + { + elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); + return 
false; + } + return true; +} + +void +lfc_change_limit_hook(int newval, void *extra) +{ + uint32 new_size = SIZE_MB_TO_CHUNKS(newval); + /* + * Stats collector detach shared memory, so we should not try to access shared memory here. + * Parallel workers first assign default value (0), so not perform truncation in parallel workers. + */ + if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker()) + return; + + /* Open cache file if not done yet */ + if (lfc_desc == 0) + { + lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); + if (lfc_desc < 0) { + elog(LOG, "Failed to open file cache %s: %m", lfc_path); + lfc_size_limit = 0; /* disable file cache */ + return; + } + } + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */ + FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + Assert(victim->access_count == 0); +#ifdef FALLOC_FL_PUNCH_HOLE + if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0) + elog(LOG, "Failed to punch hole in file: %m"); +#endif + hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); + lfc_ctl->size -= 1; + } + elog(LOG, "set local file cache limit to %d", new_size); + LWLockRelease(lfc_lock); +} + +void +lfc_init(void) +{ + /* + * In order to create our shared memory area, we have to be loaded via + * shared_preload_libraries. + */ + if (!process_shared_preload_libraries_in_progress) + elog(ERROR, "Neon module should be loaded via shared_preload_libraries"); + + DefineCustomIntVariable("neon.max_file_cache_size", + "Maximal size of Neon local file cache", + NULL, + &lfc_max_size, + 0, /* disabled by default */ + 0, + INT_MAX, + PGC_POSTMASTER, + GUC_UNIT_MB, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("neon.file_cache_size_limit", + "Current limit for size of Neon local file cache", + NULL, + &lfc_size_limit, + 0, /* disabled by default */ + 0, + INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MB, + NULL, + lfc_change_limit_hook, + NULL); + + DefineCustomStringVariable("neon.file_cache_path", + "Path to local file cache (can be raw device)", + NULL, + &lfc_path, + "file.cache", + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + if (lfc_max_size == 0) + return; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = lfc_shmem_startup; +#if PG_VERSION_NUM>=150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = lfc_shmem_request; +#else + lfc_shmem_request(); +#endif +} + +/* + * Check if page is present in the cache. + * Returns true if page is found in local cache. + */ +bool +lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +{ + BufferTag tag; + FileCacheEntry* entry; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + bool found; + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return false; + + tag.rnode = rnode; + tag.forkNum = forkNum; + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_SHARED); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0; + LWLockRelease(lfc_lock); + return found; +} + +/* + * Try to read page from local cache. 
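+ * The cache is addressed in BLOCKS_PER_CHUNK-page chunks: the lookup key is the
+ * block number rounded down to a chunk boundary and the page's slot within the
+ * chunk is the remainder (for example, with BLOCKS_PER_CHUNK = 128, block 1000
+ * falls into the chunk tagged 896 at slot 104). While the pread is in progress
+ * the chunk is pinned by unlinking it from the LRU list, so a concurrent shrink
+ * or eviction cannot reuse its slot; it is relinked once the read completes.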
+ * Returns true if page is found in local cache. + * In case of error lfc_size_limit is set to zero to disable any further opera-tins with cache. + */ +bool +lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + BufferTag tag; + FileCacheEntry* entry; + ssize_t rc; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + bool result = true; + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return false; + + tag.rnode = rnode; + tag.forkNum = forkNum; + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) + { + /* Page is not cached */ + LWLockRelease(lfc_lock); + return false; + } + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->lru_node); + LWLockRelease(lfc_lock); + + /* Open cache file if not done yet */ + if (lfc_desc == 0) + { + lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); + if (lfc_desc < 0) { + elog(LOG, "Failed to open file cache %s: %m", lfc_path); + lfc_size_limit = 0; /* disable file cache */ + result = false; + } + } + + if (lfc_desc > 0) + { + rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + if (rc != BLCKSZ) + { + elog(INFO, "Failed to read file cache: %m"); + lfc_size_limit = 0; /* disable file cache */ + result = false; + } + } + + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + Assert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + LWLockRelease(lfc_lock); + + return result; +} + +/* + * Put page in local file cache. + * If cache is full then evict some other page. + */ +void +lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + BufferTag tag; + FileCacheEntry* entry; + ssize_t rc; + bool found; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); + uint32 hash; + + if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ + return; + + tag.rnode = rnode; + tag.forkNum = forkNum; + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + + if (found) + { + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->lru_node); + } + else + { + /* + * We have two choices if all cache pages are pinned (i.e. used in IO operations): + * 1. Wait until some of this operation is completed and pages is unpinned + * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit. + * As far as probability of such event (that all pages are pinned) is considered to be very very small: + * there are should be very large number of concurrent IO operations and them are limited by max_connections, + * we prefer not to complicate code and use second approach. 
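+ * As a rough illustration (assuming the default 8 kB BLCKSZ): each extra chunk
+ * costs BLOCKS_PER_CHUNK * BLCKSZ = 1 MB, and the overshoot is bounded by the
+ * number of concurrently pinned chunks, at most one per backend doing cache IO,
+ * so the configured limit acts as a soft target rather than a hard cap.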
+ */ + if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + Assert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); + elog(LOG, "Swap file cache page"); + } + else + entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */ + entry->access_count = 1; + memset(entry->bitmap, 0, sizeof entry->bitmap); + } + LWLockRelease(lfc_lock); + + /* Open cache file if not done yet */ + if (lfc_desc == 0) + { + lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); + if (lfc_desc < 0) { + elog(LOG, "Failed to open file cache %s: %m", lfc_path); + lfc_size_limit = 0; /* disable file cache */ + } + } + if (lfc_desc > 0) + { + rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + if (rc != BLCKSZ) + { + elog(INFO, "Failed to write file cache: %m"); + lfc_size_limit = 0; /* disable file cache */ + } + } + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + Assert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + if (lfc_size_limit != 0) + entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); + LWLockRelease(lfc_lock); +} + + +/* + * Record structure holding the to be exposed cache data. + */ +typedef struct +{ + uint32 pageoffs; + Oid relfilenode; + Oid reltablespace; + Oid reldatabase; + ForkNumber forknum; + BlockNumber blocknum; + uint16 accesscount; +} LocalCachePagesRec; + +/* + * Function context for data persisting over repeated calls. + */ +typedef struct +{ + TupleDesc tupdesc; + LocalCachePagesRec *record; +} LocalCachePagesContext; + +/* + * Function returning data from the local file cache + * relation node/tablespace/database/blocknum and access_counter + */ +PG_FUNCTION_INFO_V1(local_cache_pages); + +#define NUM_LOCALCACHE_PAGES_ELEM 7 + +Datum +local_cache_pages(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + MemoryContext oldcontext; + LocalCachePagesContext *fctx; /* User function context. */ + TupleDesc tupledesc; + TupleDesc expected_tupledesc; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + HASH_SEQ_STATUS status; + FileCacheEntry* entry; + uint32 n_pages = 0; + uint32 i; + + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* Create a user function context for cross-call persistence */ + fctx = (LocalCachePagesContext *) palloc(sizeof(LocalCachePagesContext)); + + /* + * To smoothly support upgrades from version 1.0 of this extension + * transparently handle the (non-)existence of the pinning_backends + * column. We unfortunately have to get the result type for that... - + * we can't use the result type determined by the function definition + * without potentially crashing when somebody uses the old (or even + * wrong) function definition though. + */ + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM) + elog(ERROR, "incorrect number of output arguments"); + + /* Construct a tuple descriptor for the result rows. 
*/ + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber", + INT2OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 7, "accesscount", + INT4OID, -1, 0); + + fctx->tupdesc = BlessTupleDesc(tupledesc); + + LWLockAcquire(lfc_lock, LW_SHARED); + + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0; + } + fctx->record = (LocalCachePagesRec *) + MemoryContextAllocHuge(CurrentMemoryContext, + sizeof(LocalCachePagesRec) * n_pages); + + /* Set max calls and remember the user function context. */ + funcctx->max_calls = n_pages; + funcctx->user_fctx = fctx; + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + /* + * Scan through all the buffers, saving the relevant fields in the + * fctx->record structure. + * + * We don't hold the partition locks, so we don't get a consistent + * snapshot across all buffers, but we do grab the buffer header + * locks, so the information of each buffer is self-consistent. + */ + n_pages = 0; + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + if (entry->bitmap[i >> 5] & (1 << (i & 31))) + { + fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i; + fctx->record[n_pages].relfilenode = entry->key.rnode.relNode; + fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode; + fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode; + fctx->record[n_pages].forknum = entry->key.forkNum; + fctx->record[n_pages].blocknum = entry->key.blockNum + i; + fctx->record[n_pages].accesscount = entry->access_count; + n_pages += 1; + } + } + } + Assert(n_pages == funcctx->max_calls); + LWLockRelease(lfc_lock); + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + Datum values[NUM_LOCALCACHE_PAGES_ELEM]; + bool nulls[NUM_LOCALCACHE_PAGES_ELEM] = { + false, false, false, false, false, false, false + }; + + values[0] = Int64GetDatum((int64) fctx->record[i].pageoffs); + values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode); + values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); + values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase); + values[4] = ObjectIdGetDatum(fctx->record[i].forknum); + values[5] = Int64GetDatum((int64) fctx->record[i].blocknum); + values[6] = Int32GetDatum(fctx->record[i].accesscount); + + /* Build and return the tuple. 
*/ + tuple = heap_form_tuple(fctx->tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1aba2e1ede..5f134e3924 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -516,4 +516,5 @@ pg_init_libpagestore(void) smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; } + lfc_init(); } diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql index 58b98a5923..6cf111ea6a 100644 --- a/pgxn/neon/neon--1.0.sql +++ b/pgxn/neon/neon--1.0.sql @@ -22,3 +22,13 @@ AS 'MODULE_PATHNAME', 'backpressure_throttling_time' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION local_cache_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'local_cache_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE VIEW local_cache AS + SELECT P.* FROM local_cache_pages() AS P + (pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, accesscount int4); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 170a0cb72d..831756b849 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -203,4 +203,11 @@ extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumbe extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); +/* functions for local file cache */ +extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); +extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); +extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_init(void); + + #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 900f44ca10..0b34cb3ca9 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1684,6 +1684,8 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, forkNum, blkno, (uint32) (lsn >> 32), (uint32) lsn); + lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); @@ -1757,6 +1759,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum)) + return false; + tag = (BufferTag) { .rnode = reln->smgr_rnode.node, .forkNum = forknum, @@ -1899,6 +1904,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, { case T_NeonGetPageResponse: memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + lfc_write(rnode, forkNum, blkno, buffer); break; case T_NeonErrorResponse: @@ -1950,6 +1956,12 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + /* Try to read from local file cache */ + if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer)) + { + return; + } + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); @@ -2111,6 +2123,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, forknum, 
blocknum, (uint32) (lsn >> 32), (uint32) lsn); + lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwrite(reln, forknum, blocknum, buffer, skipFsync); From 1ad6e186bcb72f096eebddf022e08f01cc99d1aa Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 27 Dec 2022 12:34:48 +0400 Subject: [PATCH 151/167] Refuse ProposerElected if it is going to truncate correct WAL. Prevents commit_lsn monotonicity violation (otherwise harmless). closes https://github.com/neondatabase/neon/issues/3069 --- safekeeper/src/safekeeper.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 2c13f81476..a70ae247b7 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -727,6 +727,24 @@ where return Ok(None); } + // This might happen in a rare race when another (old) connection from + // the same walproposer writes + flushes WAL after this connection + // already sent flush_lsn in VoteRequest. It is generally safe to + // proceed, but to prevent commit_lsn surprisingly going down we should + // either refuse the session (simpler) or skip the part we already have + // from the stream (can be implemented). + if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at { + bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", + msg.term, self.flush_lsn(), msg.start_streaming_at) + } + // Otherwise this shouldn't happen. + assert!( + msg.start_streaming_at >= self.inmem.commit_lsn, + "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}", + msg.start_streaming_at, + self.inmem.commit_lsn + ); + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg From fee8bf3a1717dd4f997e7a48fb5e3d6a8333a44b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 27 Dec 2022 15:55:33 +0400 Subject: [PATCH 152/167] Remove global_commit_lsn. It is complicated and fragile to maintain and not really needed; update commit_lsn locally only when we have enough WAL flushed. ref https://github.com/neondatabase/neon/issues/3069 --- safekeeper/src/safekeeper.rs | 41 +++++++++++++++++------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index a70ae247b7..5b1b686529 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -182,7 +182,7 @@ pub struct SafeKeeperState { /// All WAL segments next to one containing local_start_lsn are /// filled with data from the beginning. pub local_start_lsn: Lsn, - /// Part of WAL acknowledged by quorum and available locally. Always points + /// Part of WAL acknowledged by quorum *and available locally*. Always points /// to record boundary. pub commit_lsn: Lsn, /// LSN that points to the end of the last backed up segment. Useful to @@ -501,10 +501,6 @@ impl AcceptorProposerMessage { /// - messages from compute (proposers) and provides replies /// - messages from broker peers pub struct SafeKeeper { - /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. - /// Note: be careful to set only if we are sure our WAL (term history) matches - /// committed one. - pub global_commit_lsn: Lsn, /// LSN since the proposer safekeeper currently talking to appends WAL; /// determines epoch switch point. 
pub epoch_start_lsn: Lsn, @@ -537,7 +533,6 @@ where } Ok(SafeKeeper { - global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { commit_lsn: state.commit_lsn, @@ -777,7 +772,6 @@ where // NB: on new clusters, this happens at the same time as // timeline_start_lsn initialization, it is taken outside to provide // upgrade. - self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. @@ -796,10 +790,21 @@ where Ok(None) } - /// Advance commit_lsn taking into account what we have locally - fn update_commit_lsn(&mut self) -> Result<()> { - let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); - assert!(commit_lsn >= self.inmem.commit_lsn); + /// Advance commit_lsn taking into account what we have locally. + /// + /// Note: it is assumed that 'WAL we have is from the right term' check has + /// already been done outside. + fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> { + // Both peers and walproposer communicate this value, we might already + // have a fresher (higher) version. + candidate = max(candidate, self.inmem.commit_lsn); + let commit_lsn = min(candidate, self.flush_lsn()); + assert!( + commit_lsn >= self.inmem.commit_lsn, + "commit_lsn monotonicity violated: old={} new={}", + self.inmem.commit_lsn, + commit_lsn + ); self.inmem.commit_lsn = commit_lsn; @@ -865,14 +870,11 @@ where self.wal_store.flush_wal()?; } - // Update global_commit_lsn + // Update commit_lsn. if msg.h.commit_lsn != Lsn(0) { - // We also obtain commit lsn from peers, so value arrived here might be stale (less) - self.global_commit_lsn = max(self.global_commit_lsn, msg.h.commit_lsn); + self.update_commit_lsn(msg.h.commit_lsn)?; } - self.inmem.peer_horizon_lsn = msg.h.truncate_lsn; - self.update_commit_lsn()?; // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only @@ -904,10 +906,6 @@ where /// Flush WAL to disk. Return AppendResponse with latest LSNs. fn handle_flush(&mut self) -> Result> { self.wal_store.flush_wal()?; - - // commit_lsn can be updated because we have new flushed data locally. - self.update_commit_lsn()?; - Ok(Some(AcceptorProposerMessage::AppendResponse( self.append_response(), ))) @@ -922,8 +920,7 @@ where // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. if sk_info.last_log_term == self.get_epoch() { - self.global_commit_lsn = max(Lsn(sk_info.commit_lsn), self.global_commit_lsn); - self.update_commit_lsn()?; + self.update_commit_lsn(Lsn(sk_info.commit_lsn))?; } } From f6bf7b20030c2520ee952fff78442d77f19506c1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 27 Dec 2022 16:33:19 +0400 Subject: [PATCH 153/167] Add tenant_id to safekeeper spans. Now that it's hard to map timeline id into project in the console, this should help a little. 
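The update_commit_lsn change above boils down to clamping the advertised value between the locally known commit_lsn and the locally flushed WAL. A minimal standalone sketch of that rule (plain integers instead of the safekeeper's Lsn type, hypothetical function name):

// Candidate values coming from the walproposer or from peers may only move
// commit_lsn forward, and never past what this safekeeper has flushed to disk.
fn clamp_commit_lsn(current_commit: u64, candidate: u64, flush_lsn: u64) -> u64 {
    let candidate = candidate.max(current_commit); // ignore stale (lower) values
    candidate.min(flush_lsn)                       // never ahead of local WAL
}

fn main() {
    assert_eq!(clamp_commit_lsn(100, 90, 200), 100);  // stale peer value: no change
    assert_eq!(clamp_commit_lsn(100, 300, 200), 200); // capped by local flush_lsn
    assert_eq!(clamp_commit_lsn(100, 150, 200), 150); // ordinary advance
}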
--- safekeeper/src/receive_wal.rs | 4 ++-- safekeeper/src/safekeeper.rs | 15 +++++++++------ safekeeper/src/send_wal.rs | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 6577e8c4d6..be7f071abb 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -52,7 +52,7 @@ impl<'pg> ReceiveWalConn<'pg> { /// Receive WAL from wal_proposer pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered(); + let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered(); // Notify the libpq client that it's allowed to send `CopyData` messages self.pg_backend @@ -69,7 +69,7 @@ impl<'pg> ReceiveWalConn<'pg> { let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( - "start handshake with wal proposer {} sysid {} timeline {}", + "start handshake with walproposer {} sysid {} timeline {}", self.peer_addr, greeting.system_id, greeting.tli, ); let server_info = ServerInfo { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 5b1b686529..fa973a3ede 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -634,10 +634,12 @@ where // system_id will be updated on mismatch if self.state.server.system_id != msg.system_id { - warn!( - "unexpected system ID arrived, got {}, expected {}", - msg.system_id, self.state.server.system_id - ); + if self.state.server.system_id != 0 { + warn!( + "unexpected system ID arrived, got {}, expected {}", + msg.system_id, self.state.server.system_id + ); + } let mut state = self.state.clone(); state.server.system_id = msg.system_id; @@ -648,8 +650,9 @@ where } info!( - "processed greeting from proposer {:?}, sending term {:?}", - msg.proposer_id, self.state.acceptor_state.term + "processed greeting from walproposer {}, sending term {:?}", + msg.proposer_id.map(|b| format!("{:X}", b)).join(""), + self.state.acceptor_state.term ); Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { term: self.state.acceptor_state.term, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a3481430d0..a054b8fe14 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -161,7 +161,7 @@ impl ReplicationConn { pgb: &mut PostgresBackend, mut start_pos: Lsn, ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); + let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered(); let tli = GlobalTimelines::get(spg.ttid)?; From 0c7b02ebc35e07bfaad70a0c4912a21a642e8bf8 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Wed, 28 Dec 2022 09:20:01 +0200 Subject: [PATCH 154/167] Move tenant related files to tenant directory (#3214) Related to https://github.com/neondatabase/neon/issues/3208 --- pageserver/src/billing_metrics.rs | 6 +-- pageserver/src/bin/pageserver.rs | 5 ++- pageserver/src/config.rs | 5 ++- pageserver/src/http/routes.rs | 42 +++++++++---------- pageserver/src/lib.rs | 9 ++-- pageserver/src/page_service.rs | 4 +- pageserver/src/tenant.rs | 11 +++-- .../{tenant_config.rs => tenant/config.rs} | 0 .../src/{tenant_mgr.rs => tenant/mgr.rs} | 5 +-- .../src/{tenant_tasks.rs => tenant/tasks.rs} | 4 +- pageserver/src/tenant/timeline.rs | 2 +- 11 files changed, 48 insertions(+), 45 deletions(-) rename pageserver/src/{tenant_config.rs => tenant/config.rs} (100%) rename pageserver/src/{tenant_mgr.rs => tenant/mgr.rs} 
(99%) rename pageserver/src/{tenant_tasks.rs => tenant/tasks.rs} (98%) diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs index f9d3e8553f..73e27618db 100644 --- a/pageserver/src/billing_metrics.rs +++ b/pageserver/src/billing_metrics.rs @@ -9,7 +9,7 @@ use tracing::*; use utils::id::TimelineId; use crate::task_mgr; -use crate::tenant_mgr; +use crate::tenant::mgr; use pageserver_api::models::TenantState; use utils::id::TenantId; @@ -161,7 +161,7 @@ pub async fn collect_metrics_task( ); // get list of tenants - let tenants = tenant_mgr::list_tenants().await; + let tenants = mgr::list_tenants().await; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -169,7 +169,7 @@ pub async fn collect_metrics_task( continue; } - let tenant = tenant_mgr::get_tenant(tenant_id, true).await?; + let tenant = mgr::get_tenant(tenant_id, true).await?; let mut tenant_resident_size = 0; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d12063f5aa..2b4dcc68f0 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -18,7 +18,8 @@ use pageserver::{ task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, }, - tenant_mgr, virtual_file, + tenant::mgr, + virtual_file, }; use utils::{ auth::JwtAuth, @@ -284,7 +285,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { let remote_storage = create_remote_storage_client(conf)?; // Scan the local 'tenants/' directory and start loading the tenants - BACKGROUND_RUNTIME.block_on(tenant_mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 66f8a9f4b8..deb79531a4 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -27,14 +27,15 @@ use utils::{ postgres_backend::AuthType, }; +use crate::tenant::config::TenantConf; +use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; -use crate::tenant_config::{TenantConf, TenantConfOpt}; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX, }; pub mod defaults { - use crate::tenant_config::defaults::*; + use crate::tenant::config::defaults::*; use const_format::formatcp; pub use pageserver_api::{ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6d97f3206e..66a1607801 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,9 +12,9 @@ use super::models::{ TimelineCreateRequest, TimelineInfo, }; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::tenant::config::TenantConfOpt; use crate::tenant::{with_ondemand_download, Timeline}; -use crate::tenant_config::TenantConfOpt; -use crate::{config::PageServerConf, tenant_mgr}; +use crate::{config::PageServerConf, tenant::mgr}; use utils::{ auth::JwtAuth, http::{ @@ -170,7 +170,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, check_permission(&request, Some(tenant_id))?; let response_data = async { - let tenant = tenant_mgr::get_tenant(tenant_id, true) + let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::NotFound)?; let timelines = tenant.list_timelines(); @@ -262,7 +262,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result) -> Result, if let Some(remote_storage) = &state.remote_storage { // FIXME: distinguish between "Tenant already exists" and other errors - tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) + mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await .map_err(ApiError::InternalServerError)?; @@ -340,7 +340,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; - tenant_mgr::detach_tenant(conf, tenant_id) + mgr::detach_tenant(conf, tenant_id) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) .await // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. @@ -372,7 +372,7 @@ async fn tenant_load_handler(request: Request) -> Result, A check_permission(&request, Some(tenant_id))?; let state = get_state(&request); - tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) + mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) .instrument(info_span!("load", tenant = %tenant_id)) .await .map_err(ApiError::InternalServerError)?; @@ -386,7 +386,7 @@ async fn tenant_ignore_handler(request: Request) -> Result, let state = get_state(&request); let conf = state.conf; - tenant_mgr::ignore_tenant(conf, tenant_id) + mgr::ignore_tenant(conf, tenant_id) .instrument(info_span!("ignore_tenant", tenant = %tenant_id)) .await // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors. 
@@ -399,7 +399,7 @@ async fn tenant_ignore_handler(request: Request) -> Result, async fn tenant_list_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; - let response_data = tenant_mgr::list_tenants() + let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await .iter() @@ -419,7 +419,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro check_permission(&request, Some(tenant_id))?; let tenant_info = async { - let tenant = tenant_mgr::get_tenant(tenant_id, false).await?; + let tenant = mgr::get_tenant(tenant_id, false).await?; // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -446,7 +446,7 @@ async fn tenant_size_handler(request: Request) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let tenant = tenant_mgr::get_tenant(tenant_id, true) + let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::InternalServerError)?; @@ -567,7 +567,7 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result) -> Result Result> { - let tenant = tenant_mgr::get_tenant(tenant_id, false).await?; + let tenant = mgr::get_tenant(tenant_id, false).await?; match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { Ok(wait_result) => wait_result // no .context(), the error message is good enough and some tests depend on it diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 308130c799..eb28e6da0a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -55,11 +55,12 @@ use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; -use crate::tenant_config::TenantConfOpt; + use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; use crate::walredo::WalRedoManager; @@ -84,6 +85,10 @@ mod par_fsync; pub mod storage_layer; mod storage_sync; +pub mod config; +pub mod mgr; +pub mod tasks; + mod timeline; pub mod size; @@ -1422,7 +1427,7 @@ impl Tenant { // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. 
- crate::tenant_tasks::start_background_loops(self.tenant_id); + tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { timeline.set_state(TimelineState::Active); @@ -2576,7 +2581,7 @@ pub mod harness { }; use super::*; - use crate::tenant_config::{TenantConf, TenantConfOpt}; + use crate::tenant::config::{TenantConf, TenantConfOpt}; use hex_literal::hex; use utils::id::{TenantId, TimelineId}; diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant/config.rs similarity index 100% rename from pageserver/src/tenant_config.rs rename to pageserver/src/tenant/config.rs diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant/mgr.rs similarity index 99% rename from pageserver/src/tenant_mgr.rs rename to pageserver/src/tenant/mgr.rs index e4e9d0c6e8..44849de735 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -17,8 +17,8 @@ use utils::crashsafe; use crate::config::PageServerConf; use crate::task_mgr::{self, TaskKind}; +use crate::tenant::config::TenantConfOpt; use crate::tenant::{Tenant, TenantState}; -use crate::tenant_config::TenantConfOpt; use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; @@ -216,8 +216,7 @@ pub async fn create_tenant( hash_map::Entry::Vacant(v) => { // Hold the write_tenants() lock, since all of this is local IO. // If this section ever becomes contentious, introduce a new `TenantState::Creating`. - let tenant_directory = - super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?; + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; let created_tenant = schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?; let crated_tenant_id = created_tenant.tenant_id(); diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant/tasks.rs similarity index 98% rename from pageserver/src/tenant_tasks.rs rename to pageserver/src/tenant/tasks.rs index d71f244725..8397d26e5d 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,8 +8,8 @@ use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; -use crate::tenant_mgr; use tracing::*; use utils::id::TenantId; @@ -155,7 +155,7 @@ async fn wait_for_active_tenant( wait: Duration, ) -> ControlFlow<(), Arc> { let tenant = loop { - match tenant_mgr::get_tenant(tenant_id, false).await { + match mgr::get_tenant(tenant_id, false).await { Ok(tenant) => break tenant, Err(e) => { error!("Failed to get a tenant {tenant_id}: {e:#}"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 55ede57e53..bbfcad5734 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -42,7 +42,7 @@ use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; -use crate::tenant_config::TenantConfOpt; +use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; From 172c7e5f92d03f3a78265771b465ad28b7615e43 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Wed, 28 Dec 2022 15:12:06 +0200 Subject: [PATCH 155/167] Split upload queue code from storage_sync.rs (#3216) https://github.com/neondatabase/neon/issues/3208 --- pageserver/src/lib.rs | 1 - 
pageserver/src/tenant.rs | 22 +- ...rage_sync.rs => remote_timeline_client.rs} | 259 ++---------------- .../delete.rs | 0 .../download.rs | 0 .../index.rs | 0 .../upload.rs | 2 +- .../src/tenant/storage_layer/remote_layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 8 +- pageserver/src/tenant/upload_queue.rs | 213 ++++++++++++++ 10 files changed, 262 insertions(+), 245 deletions(-) rename pageserver/src/tenant/{storage_sync.rs => remote_timeline_client.rs} (85%) rename pageserver/src/tenant/{storage_sync => remote_timeline_client}/delete.rs (100%) rename pageserver/src/tenant/{storage_sync => remote_timeline_client}/download.rs (100%) rename pageserver/src/tenant/{storage_sync => remote_timeline_client}/index.rs (100%) rename pageserver/src/tenant/{storage_sync => remote_timeline_client}/upload.rs (97%) create mode 100644 pageserver/src/tenant/upload_queue.rs diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 80b05a76a6..29050a5bc2 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -13,7 +13,6 @@ pub mod profiling; pub mod repository; pub mod task_mgr; pub mod tenant; - pub mod trace; pub mod virtual_file; pub mod walingest; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index eb28e6da0a..4c93490177 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -45,9 +45,7 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; -use self::storage_sync::create_remote_timeline_client; -use self::storage_sync::index::IndexPart; -use self::storage_sync::RemoteTimelineClient; +use self::remote_timeline_client::RemoteTimelineClient; use crate::config::PageServerConf; use crate::import_datadir; use crate::is_uninit_mark; @@ -57,6 +55,7 @@ use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; +use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; @@ -82,12 +81,13 @@ pub mod layer_map; pub mod metadata; mod par_fsync; +mod remote_timeline_client; pub mod storage_layer; -mod storage_sync; pub mod config; pub mod mgr; pub mod tasks; +pub mod upload_queue; mod timeline; @@ -648,8 +648,12 @@ impl Tenant { .as_ref() .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?; - let remote_timelines = - storage_sync::list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?; + let remote_timelines = remote_timeline_client::list_remote_timelines( + remote_storage, + self.conf, + self.tenant_id, + ) + .await?; info!("found {} timelines", remote_timelines.len()); @@ -733,7 +737,7 @@ impl Tenant { .context("Failed to create new timeline directory")?; let remote_client = - create_remote_timeline_client(remote_storage, self.conf, self.tenant_id, timeline_id)?; + RemoteTimelineClient::new(remote_storage, self.conf, self.tenant_id, timeline_id)?; let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() { let timelines = self.timelines.lock().unwrap(); @@ -995,7 +999,7 @@ impl Tenant { .remote_storage .as_ref() .map(|remote_storage| { - create_remote_timeline_client( + RemoteTimelineClient::new( remote_storage.clone(), self.conf, self.tenant_id, @@ -2192,7 +2196,7 @@ impl Tenant { let tenant_id = self.tenant_id; let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { - let remote_client = 
create_remote_timeline_client( + let remote_client = RemoteTimelineClient::new( remote_storage.clone(), self.conf, tenant_id, diff --git a/pageserver/src/tenant/storage_sync.rs b/pageserver/src/tenant/remote_timeline_client.rs similarity index 85% rename from pageserver/src/tenant/storage_sync.rs rename to pageserver/src/tenant/remote_timeline_client.rs index ef57f91a02..e27b0a8133 100644 --- a/pageserver/src/tenant/storage_sync.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -58,7 +58,7 @@ //! To have a consistent remote structure, it's important that uploads and //! deletions are performed in the right order. For example, the index file //! contains a list of layer files, so it must not be uploaded until all the -//! layer files that are in its list have been succesfully uploaded. +//! layer files that are in its list have been successfully uploaded. //! //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as @@ -140,7 +140,7 @@ //! Note that if we crash during file deletion between the index update //! that removes the file from the list of files, and deleting the remote file, //! the file is leaked in the remote storage. Similarly, if a new file is created -//! and uploaded, but the pageserver dies permantently before updating the +//! and uploaded, but the pageserver dies permanently before updating the //! remote index file, the new file is leaked in remote storage. We accept and //! tolerate that for now. //! Note further that we cannot easily fix this by scheduling deletes for every @@ -207,30 +207,30 @@ mod upload; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; -use std::collections::{HashMap, VecDeque}; -use std::fmt::Debug; -use std::ops::DerefMut; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; +use std::ops::DerefMut; use tokio::runtime::Runtime; use tracing::{info, warn}; use tracing::{info_span, Instrument}; - use utils::lsn::Lsn; use crate::metrics::RemoteOpFileKind; use crate::metrics::RemoteOpKind; use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics}; -use crate::tenant::storage_sync::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::{ config::PageServerConf, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, + tenant::upload_queue::{ + UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, + }, {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}, }; @@ -286,206 +286,30 @@ pub struct RemoteTimelineClient { storage_impl: GenericRemoteStorage, } -// clippy warns that Uninitialized is much smaller than Initialized, which wastes -// memory for Uninitialized variants. Doesn't matter in practice, there are not -// that many upload queues in a running pageserver, and most of them are initialized -// anyway. -#[allow(clippy::large_enum_variant)] -enum UploadQueue { - Uninitialized, - Initialized(UploadQueueInitialized), - Stopped(UploadQueueStopped), -} - -impl UploadQueue { - fn as_str(&self) -> &'static str { - match self { - UploadQueue::Uninitialized => "Uninitialized", - UploadQueue::Initialized(_) => "Initialized", - UploadQueue::Stopped(_) => "Stopped", - } - } -} - -/// This keeps track of queued and in-progress tasks. 
-struct UploadQueueInitialized { - /// Counter to assign task IDs - task_counter: u64, - - /// All layer files stored in the remote storage, taking into account all - /// in-progress and queued operations - latest_files: HashMap, - - /// How many file uploads or deletions been scheduled, since the - /// last (scheduling of) metadata index upload? - latest_files_changes_since_metadata_upload_scheduled: u64, - - /// Metadata stored in the remote storage, taking into account all - /// in-progress and queued operations. - /// DANGER: do not return to outside world, e.g., safekeepers. - latest_metadata: TimelineMetadata, - - /// `disk_consistent_lsn` from the last metadata file that was successfully - /// uploaded. `Lsn(0)` if nothing was uploaded yet. - /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. - /// Safekeeper can rely on it to make decisions for WAL storage. - last_uploaded_consistent_lsn: Lsn, - - // Breakdown of different kinds of tasks currently in-progress - num_inprogress_layer_uploads: usize, - num_inprogress_metadata_uploads: usize, - num_inprogress_deletions: usize, - - /// Tasks that are currently in-progress. In-progress means that a tokio Task - /// has been launched for it. An in-progress task can be busy uploading, but it can - /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can - /// be waiting for retry in `exponential_backoff`. - inprogress_tasks: HashMap>, - - /// Queued operations that have not been launched yet. They might depend on previous - /// tasks to finish. For example, metadata upload cannot be performed before all - /// preceding layer file uploads have completed. - queued_operations: VecDeque, -} - -struct UploadQueueStopped { - last_uploaded_consistent_lsn: Lsn, -} - -impl UploadQueue { - fn initialize_empty_remote( - &mut self, - metadata: &TimelineMetadata, - ) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized => (), - UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { - anyhow::bail!("already initialized, state {}", self.as_str()) - } - } - - info!("initializing upload queue for empty remote"); - - let state = UploadQueueInitialized { - // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. - latest_files: HashMap::new(), - latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: metadata.clone(), - // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent - // safekeepers from garbage-collecting anything. 
- last_uploaded_consistent_lsn: Lsn(0), - // what follows are boring default initializations - task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, - inprogress_tasks: HashMap::new(), - queued_operations: VecDeque::new(), - }; - - *self = UploadQueue::Initialized(state); - Ok(self.initialized_mut().expect("we just set it")) - } - - fn initialize_with_current_remote_index_part( - &mut self, - index_part: &IndexPart, - ) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized => (), - UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { - anyhow::bail!("already initialized, state {}", self.as_str()) - } - } - - let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); - for layer_name in &index_part.timeline_layers { - let layer_metadata = index_part - .layer_metadata - .get(layer_name) - .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); - files.insert(layer_name.to_owned(), layer_metadata); - } - - let index_part_metadata = index_part.parse_metadata()?; - info!( - "initializing upload queue with remote index_part.disk_consistent_lsn: {}", - index_part_metadata.disk_consistent_lsn() - ); - - let state = UploadQueueInitialized { - latest_files: files, - latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: index_part_metadata.clone(), - last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(), - // what follows are boring default initializations - task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, - inprogress_tasks: HashMap::new(), - queued_operations: VecDeque::new(), - }; - - *self = UploadQueue::Initialized(state); - Ok(self.initialized_mut().expect("we just set it")) - } - - fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { - anyhow::bail!("queue is in state {}", self.as_str()) - } - UploadQueue::Initialized(x) => Ok(x), - } - } -} - -/// An in-progress upload or delete task. -#[derive(Debug)] -struct UploadTask { - /// Unique ID of this task. Used as the key in `inprogress_tasks` above. - task_id: u64, - retries: AtomicU32, - - op: UploadOp, -} - -#[derive(Debug)] -enum UploadOp { - /// Upload a layer file - UploadLayer(LayerFileName, LayerFileMetadata), - - /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), - - /// Delete a file. - Delete(RemoteOpFileKind, LayerFileName), - - /// Barrier. When the barrier operation is reached, - Barrier(tokio::sync::watch::Sender<()>), -} - -impl std::fmt::Display for UploadOp { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - match self { - UploadOp::UploadLayer(path, metadata) => { - write!( - f, - "UploadLayer({}, size={:?})", - path.file_name(), - metadata.file_size() - ) - } - UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn), - UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()), - UploadOp::Barrier(_) => write!(f, "Barrier"), - } - } -} - impl RemoteTimelineClient { + /// + /// Create a remote storage client for given timeline + /// + /// Note: the caller must initialize the upload queue before any uploads can be scheduled, + /// by calling init_upload_queue. 
+ /// + pub fn new( + remote_storage: GenericRemoteStorage, + conf: &'static PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result { + Ok(RemoteTimelineClient { + conf, + runtime: &BACKGROUND_RUNTIME, + tenant_id, + timeline_id, + storage_impl: remote_storage, + upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), + }) + } + /// Initialize the upload queue for a remote storage that already received /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. @@ -1156,29 +980,6 @@ impl RemoteTimelineClient { } } -/// -/// Create a remote storage client for given timeline -/// -/// Note: the caller must initialize the upload queue before any uploads can be scheduled, -/// by calling init_upload_queue. -/// -pub fn create_remote_timeline_client( - remote_storage: GenericRemoteStorage, - conf: &'static PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, -) -> anyhow::Result { - Ok(RemoteTimelineClient { - conf, - runtime: &BACKGROUND_RUNTIME, - tenant_id, - timeline_id, - storage_impl: remote_storage, - upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), - }) -} - #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/storage_sync/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs similarity index 100% rename from pageserver/src/tenant/storage_sync/delete.rs rename to pageserver/src/tenant/remote_timeline_client/delete.rs diff --git a/pageserver/src/tenant/storage_sync/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs similarity index 100% rename from pageserver/src/tenant/storage_sync/download.rs rename to pageserver/src/tenant/remote_timeline_client/download.rs diff --git a/pageserver/src/tenant/storage_sync/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs similarity index 100% rename from pageserver/src/tenant/storage_sync/index.rs rename to pageserver/src/tenant/remote_timeline_client/index.rs diff --git a/pageserver/src/tenant/storage_sync/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs similarity index 97% rename from pageserver/src/tenant/storage_sync/upload.rs rename to pageserver/src/tenant/remote_timeline_client/upload.rs index 08cea6268b..5082fa1634 100644 --- a/pageserver/src/tenant/storage_sync/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -5,7 +5,7 @@ use fail::fail_point; use std::path::Path; use tokio::fs; -use crate::{config::PageServerConf, tenant::storage_sync::index::IndexPart}; +use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart}; use remote_storage::GenericRemoteStorage; use utils::id::{TenantId, TimelineId}; diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index c2c11d7bff..33474bb4a2 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -3,8 +3,8 @@ //! 
use crate::config::PageServerConf; use crate::repository::Key; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; -use crate::tenant::storage_sync::index::LayerFileMetadata; use anyhow::{bail, Result}; use std::ops::Range; use std::path::PathBuf; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bbfcad5734..93eb643d12 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,11 +23,11 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, RemoteLayer, }; -use crate::tenant::storage_sync::{self, index::LayerFileMetadata}; use crate::tenant::{ ephemeral_file::is_ephemeral_file, layer_map::{LayerMap, SearchResult}, @@ -64,9 +64,9 @@ use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; +use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::RemoteTimelineClient; use super::storage_layer::{DeltaLayer, ImageLayer, Layer}; -use super::storage_sync::index::IndexPart; -use super::storage_sync::RemoteTimelineClient; #[derive(Debug, PartialEq, Eq, Clone, Copy)] enum FlushLoopState { @@ -1122,7 +1122,7 @@ impl Timeline { num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these - } else if storage_sync::is_temp_download_file(&direntry_path) { + } else if remote_timeline_client::is_temp_download_file(&direntry_path) { info!( "skipping temp download file, reconcile_with_remote will resume / clean up: {}", fname diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs new file mode 100644 index 0000000000..790b2f59aa --- /dev/null +++ b/pageserver/src/tenant/upload_queue.rs @@ -0,0 +1,213 @@ +use crate::metrics::RemoteOpFileKind; + +use super::storage_layer::LayerFileName; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::index::IndexPart; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use std::collections::{HashMap, VecDeque}; +use std::fmt::Debug; + +use std::sync::Arc; +use tracing::info; + +use std::sync::atomic::AtomicU32; +use utils::lsn::Lsn; + +// clippy warns that Uninitialized is much smaller than Initialized, which wastes +// memory for Uninitialized variants. Doesn't matter in practice, there are not +// that many upload queues in a running pageserver, and most of them are initialized +// anyway. +#[allow(clippy::large_enum_variant)] +pub(crate) enum UploadQueue { + Uninitialized, + Initialized(UploadQueueInitialized), + Stopped(UploadQueueStopped), +} + +impl UploadQueue { + fn as_str(&self) -> &'static str { + match self { + UploadQueue::Uninitialized => "Uninitialized", + UploadQueue::Initialized(_) => "Initialized", + UploadQueue::Stopped(_) => "Stopped", + } + } +} + +/// This keeps track of queued and in-progress tasks. 
+pub(crate) struct UploadQueueInitialized { + /// Counter to assign task IDs + pub(crate) task_counter: u64, + + /// All layer files stored in the remote storage, taking into account all + /// in-progress and queued operations + pub(crate) latest_files: HashMap, + + /// How many file uploads or deletions been scheduled, since the + /// last (scheduling of) metadata index upload? + pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64, + + /// Metadata stored in the remote storage, taking into account all + /// in-progress and queued operations. + /// DANGER: do not return to outside world, e.g., safekeepers. + pub(crate) latest_metadata: TimelineMetadata, + + /// `disk_consistent_lsn` from the last metadata file that was successfully + /// uploaded. `Lsn(0)` if nothing was uploaded yet. + /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. + /// Safekeeper can rely on it to make decisions for WAL storage. + pub(crate) last_uploaded_consistent_lsn: Lsn, + + // Breakdown of different kinds of tasks currently in-progress + pub(crate) num_inprogress_layer_uploads: usize, + pub(crate) num_inprogress_metadata_uploads: usize, + pub(crate) num_inprogress_deletions: usize, + + /// Tasks that are currently in-progress. In-progress means that a tokio Task + /// has been launched for it. An in-progress task can be busy uploading, but it can + /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can + /// be waiting for retry in `exponential_backoff`. + pub(crate) inprogress_tasks: HashMap>, + + /// Queued operations that have not been launched yet. They might depend on previous + /// tasks to finish. For example, metadata upload cannot be performed before all + /// preceding layer file uploads have completed. + pub(crate) queued_operations: VecDeque, +} + +pub(crate) struct UploadQueueStopped { + pub(crate) last_uploaded_consistent_lsn: Lsn, +} + +impl UploadQueue { + pub(crate) fn initialize_empty_remote( + &mut self, + metadata: &TimelineMetadata, + ) -> anyhow::Result<&mut UploadQueueInitialized> { + match self { + UploadQueue::Uninitialized => (), + UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { + anyhow::bail!("already initialized, state {}", self.as_str()) + } + } + + info!("initializing upload queue for empty remote"); + + let state = UploadQueueInitialized { + // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. + latest_files: HashMap::new(), + latest_files_changes_since_metadata_upload_scheduled: 0, + latest_metadata: metadata.clone(), + // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent + // safekeepers from garbage-collecting anything. 
+ last_uploaded_consistent_lsn: Lsn(0), + // what follows are boring default initializations + task_counter: 0, + num_inprogress_layer_uploads: 0, + num_inprogress_metadata_uploads: 0, + num_inprogress_deletions: 0, + inprogress_tasks: HashMap::new(), + queued_operations: VecDeque::new(), + }; + + *self = UploadQueue::Initialized(state); + Ok(self.initialized_mut().expect("we just set it")) + } + + pub(crate) fn initialize_with_current_remote_index_part( + &mut self, + index_part: &IndexPart, + ) -> anyhow::Result<&mut UploadQueueInitialized> { + match self { + UploadQueue::Uninitialized => (), + UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { + anyhow::bail!("already initialized, state {}", self.as_str()) + } + } + + let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); + for layer_name in &index_part.timeline_layers { + let layer_metadata = index_part + .layer_metadata + .get(layer_name) + .map(LayerFileMetadata::from) + .unwrap_or(LayerFileMetadata::MISSING); + files.insert(layer_name.to_owned(), layer_metadata); + } + + let index_part_metadata = index_part.parse_metadata()?; + info!( + "initializing upload queue with remote index_part.disk_consistent_lsn: {}", + index_part_metadata.disk_consistent_lsn() + ); + + let state = UploadQueueInitialized { + latest_files: files, + latest_files_changes_since_metadata_upload_scheduled: 0, + latest_metadata: index_part_metadata.clone(), + last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(), + // what follows are boring default initializations + task_counter: 0, + num_inprogress_layer_uploads: 0, + num_inprogress_metadata_uploads: 0, + num_inprogress_deletions: 0, + inprogress_tasks: HashMap::new(), + queued_operations: VecDeque::new(), + }; + + *self = UploadQueue::Initialized(state); + Ok(self.initialized_mut().expect("we just set it")) + } + + pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + match self { + UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + anyhow::bail!("queue is in state {}", self.as_str()) + } + UploadQueue::Initialized(x) => Ok(x), + } + } +} + +/// An in-progress upload or delete task. +#[derive(Debug)] +pub(crate) struct UploadTask { + /// Unique ID of this task. Used as the key in `inprogress_tasks` above. + pub(crate) task_id: u64, + pub(crate) retries: AtomicU32, + + pub(crate) op: UploadOp, +} + +#[derive(Debug)] +pub(crate) enum UploadOp { + /// Upload a layer file + UploadLayer(LayerFileName, LayerFileMetadata), + + /// Upload the metadata file + UploadMetadata(IndexPart, Lsn), + + /// Delete a file. + Delete(RemoteOpFileKind, LayerFileName), + + /// Barrier. When the barrier operation is reached, + Barrier(tokio::sync::watch::Sender<()>), +} + +impl std::fmt::Display for UploadOp { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + UploadOp::UploadLayer(path, metadata) => { + write!( + f, + "UploadLayer({}, size={:?})", + path.file_name(), + metadata.file_size() + ) + } + UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn), + UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()), + UploadOp::Barrier(_) => write!(f, "Barrier"), + } + } +} From 42c6ddef8edcd1e0dae8cba5a4c8c1f2b3d70589 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Wed, 28 Dec 2022 20:52:24 +0200 Subject: [PATCH 156/167] Rename ZENITH_AUTH_TOKEN to NEON_AUTH_TOKEN Changes are: * Pageserver: start reading from NEON_AUTH_TOKEN by default. 
Warn if ZENITH_AUTH_TOKEN is used instead. * Compute, Docs: fix the default token name. * Control plane: change name of the token in configs and start sequences. Compatibility: * Control plane in tests: works, no compatibility expected. * Control plane for local installations: never officially supported auth anyways. If someone did enable it, `pageserver.toml` should be updated with the new `neon.pageserver_connstring` and `neon.safekeeper_token_env`. * Pageserver is backward compatible: you can run new Pageserver with old commands and environment configurations, but not vice-versa. The culprit is the hard-coded `NEON_AUTH_TOKEN`. * Compute has no code changes. As long as you update its configuration file with `pageserver_connstring` in sync with the start up scripts, you are good to go. * Safekeeper has no code changes and has never used `ZENITH_AUTH_TOKEN` in the first place. --- control_plane/src/compute.rs | 12 ++++++------ control_plane/src/pageserver.rs | 2 +- docs/authentication.md | 6 +++--- pageserver/src/bin/pageserver.rs | 26 +++++++++++++++++++++----- pgxn/neon/libpagestore.c | 2 +- 5 files changed, 32 insertions(+), 16 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 547aa14d39..8731cf2583 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -201,7 +201,7 @@ impl PostgresNode { .stderr(Stdio::piped()); if let Some(token) = auth_token { - cmd.env("ZENITH_AUTH_TOKEN", token); + cmd.env("NEON_AUTH_TOKEN", token); } let sync_handle = cmd @@ -304,17 +304,17 @@ impl PostgresNode { // Set up authentication // - // $ZENITH_AUTH_TOKEN will be replaced with value from environment + // $NEON_AUTH_TOKEN will be replaced with value from environment // variable during compute pg startup. It is done this way because // otherwise user will be able to retrieve the value using SHOW // command or pg_settings let password = if let AuthType::NeonJWT = auth_type { - "$ZENITH_AUTH_TOKEN" + "$NEON_AUTH_TOKEN" } else { "" }; // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere. - // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN + // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN // We parse this string and build it back with token from env var, and for simplicity rebuild // uses only needed variables namely host, port, user, password. 
format!("postgresql://no_user:{password}@{host}:{port}") @@ -323,7 +323,7 @@ impl PostgresNode { conf.append_line(""); conf.append("neon.pageserver_connstring", &pageserver_connstr); if let AuthType::NeonJWT = auth_type { - conf.append("neon.safekeeper_token_env", "$ZENITH_AUTH_TOKEN"); + conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN"); } conf.append("neon.tenant_id", &self.tenant_id.to_string()); conf.append("neon.timeline_id", &self.timeline_id.to_string()); @@ -448,7 +448,7 @@ impl PostgresNode { self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ); if let Some(token) = auth_token { - cmd.env("ZENITH_AUTH_TOKEN", token); + cmd.env("NEON_AUTH_TOKEN", token); } let pg_ctl = cmd.output().context("pg_ctl failed")?; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 0c2415965a..68e94b2fdc 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -320,7 +320,7 @@ impl PageServerNode { let token = self .env .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; - vec![("ZENITH_AUTH_TOKEN".to_owned(), token)] + vec![("NEON_AUTH_TOKEN".to_owned(), token)] } else { Vec::new() }) diff --git a/docs/authentication.md b/docs/authentication.md index 0752fae19f..e22d7b700f 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -65,7 +65,7 @@ There is no administrative API except those provided by PostgreSQL. #### Outgoing connections Compute connects to Pageserver for getting pages. -The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$ZENITH_AUTH_TOKEN@localhost:15028`. +The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`. The environment variable inside the connection string is substituted with the JWT token. @@ -77,7 +77,7 @@ If the GUC is unset, no token is passed. Note that both tokens can be (and typically are) the same; the scope is the tenant and the token is usually passed through the -`$ZENITH_AUTH_TOKEN` environment variable. +`$NEON_AUTH_TOKEN` environment variable. ### Pageserver #### Overview @@ -114,7 +114,7 @@ either of three values: Pageserver makes a connection to a Safekeeper for each active timeline. As Pageserver may want to access any timeline it has on the disk, it is given a blanket JWT token to access any data on any Safekeeper. -This token is passed through an environment variable called `ZENITH_AUTH_TOKEN` +This token is passed through an environment variable called `NEON_AUTH_TOKEN` (non-configurable as of writing this text). A better way _may be_ to store JWT token for each timeline next to it, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2b4dcc68f0..5246541375 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -264,19 +264,35 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }; info!("Using auth: {:#?}", conf.auth_type); - match var("ZENITH_AUTH_TOKEN") { - Ok(v) => { + // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration. 
+ match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) { + (old, Ok(v)) => { info!("Loaded JWT token for authentication with Safekeeper"); + if let Ok(v_old) = old { + warn!( + "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated" + ); + if v_old != v { + warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN"); + } + } pageserver::config::SAFEKEEPER_AUTH_TOKEN .set(Arc::new(v)) .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; } - Err(VarError::NotPresent) => { + (Ok(v), _) => { + info!("Loaded JWT token for authentication with Safekeeper"); + warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN"); + pageserver::config::SAFEKEEPER_AUTH_TOKEN + .set(Arc::new(v)) + .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; + } + (_, Err(VarError::NotPresent)) => { info!("No JWT token for authentication with Safekeeper detected"); } - Err(e) => { + (_, Err(e)) => { return Err(e).with_context(|| { - "Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable" + "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" }) } }; diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5f134e3924..c6199dddc0 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -420,7 +420,7 @@ pg_init_libpagestore(void) NULL, NULL, NULL); DefineCustomStringVariable("neon.safekeeper_token_env", - "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $ZENITH_AUTH_TOKEN", + "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN", NULL, &safekeeper_token_env, NULL, From bd7a9e6274225eb1346811e661f6afb66d17a591 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 28 Dec 2022 17:48:49 +0200 Subject: [PATCH 157/167] switch to debug from info to produce less noise --- pageserver/src/tenant/remote_timeline_client.rs | 6 +++--- pageserver/src/tenant/timeline.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e27b0a8133..45988ff47a 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -214,7 +214,7 @@ use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; use std::ops::DerefMut; use tokio::runtime::Runtime; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; @@ -675,7 +675,7 @@ impl RemoteTimelineClient { // We can launch this task. Remove it from the queue first. let next_op = upload_queue.queued_operations.pop_front().unwrap(); - info!("starting op: {}", next_op); + debug!("starting op: {}", next_op); // Update the counters match next_op { @@ -867,7 +867,7 @@ impl RemoteTimelineClient { task.op, retries ); } else { - info!("remote task {} completed successfully", task.op); + debug!("remote task {} completed successfully", task.op); } // The task has completed succesfully. Remove it from the in-progress list. 
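
A note on the log-level change in the hunk above: with the `tracing` macros this module already imports, the demoted messages are still compiled in, they are simply filtered out unless the subscriber is configured for DEBUG verbosity. A small, self-contained illustration, assuming the tracing and tracing_subscriber crates; the pageserver's actual subscriber setup is configured elsewhere and is not part of this patch:

use tracing::{debug, info, Level};

fn main() {
    // Keep events at INFO and above; DEBUG events are dropped by the filter.
    tracing_subscriber::fmt().with_max_level(Level::INFO).init();

    debug!("starting op: UploadLayer(...)"); // suppressed at the default INFO level
    info!("remote task completed successfully after 3 retries"); // still printed
}
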
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 93eb643d12..137c38ca85 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2593,7 +2593,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await @@ -2807,7 +2807,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await From f731e9b3de1089f4bc5d6fb683395eb428b516fe Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 29 Dec 2022 12:11:04 +0200 Subject: [PATCH 158/167] Fix serialization of billing metrics (#3215) Fixes: - serialize TenantId and TimelineId as strings, - skip TimelineId if none - serialize `metric_type` field as `type` - add `idempotency_key` field to uniquely identify metrics --- pageserver/src/billing_metrics.rs | 50 +++++++++++++++++++++++-------- pageserver/src/bin/pageserver.rs | 1 + 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs index 73e27618db..3a6b83773d 100644 --- a/pageserver/src/billing_metrics.rs +++ b/pageserver/src/billing_metrics.rs @@ -6,6 +6,7 @@ use anyhow; use tracing::*; +use utils::id::NodeId; use utils::id::TimelineId; use crate::task_mgr; @@ -14,12 +15,14 @@ use pageserver_api::models::TenantState; use utils::id::TenantId; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; use std::fmt; use std::str::FromStr; use std::time::Duration; use chrono::{DateTime, Utc}; +use rand::Rng; use reqwest::Url; /// BillingMetric struct that defines the format for one metric entry @@ -30,27 +33,36 @@ use reqwest::Url; /// "metric": "remote_storage_size", /// "type": "absolute", /// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d", -/// "timeline_id": "00000000000000000000000000000000", -/// "time": ..., +/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143", +/// "time": "2022-12-28T11:07:19.317310284Z", +/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019", /// "value": 12345454, /// } /// ``` +#[serde_as] #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] pub struct BillingMetric { pub metric: BillingMetricKind, + #[serde(rename = "type")] pub metric_type: &'static str, + #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, + #[serde_as(as = "Option")] + #[serde(skip_serializing_if = "Option::is_none")] pub timeline_id: Option, pub time: DateTime, + pub idempotency_key: String, pub value: u64, } impl BillingMetric { - pub fn new_absolute( + pub fn new_absolute( metric: BillingMetricKind, tenant_id: TenantId, timeline_id: Option, value: u64, + node_id: NodeId, + rng: &mut R, ) -> Self { Self { metric, @@ -58,6 +70,8 @@ impl BillingMetric { tenant_id, timeline_id, time: Utc::now(), + // key that allows metric collector to distinguish unique events + idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)), value, } } @@ -123,6 +137,7 @@ struct 
EventChunk<'a> { pub async fn collect_metrics( metric_collection_endpoint: &Url, metric_collection_interval: Duration, + node_id: NodeId, ) -> anyhow::Result<()> { let mut ticker = tokio::time::interval(metric_collection_interval); @@ -139,7 +154,7 @@ pub async fn collect_metrics( return Ok(()); }, _ = ticker.tick() => { - collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint).await?; + collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?; } } } @@ -153,6 +168,7 @@ pub async fn collect_metrics_task( client: &reqwest::Client, cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, + node_id: NodeId, ) -> anyhow::Result<()> { let mut current_metrics: Vec<(BillingMetricsKey, u64)> = Vec::new(); trace!( @@ -241,15 +257,23 @@ pub async fn collect_metrics_task( for chunk in chunks { chunk_to_send.clear(); - // enrich metrics with timestamp and metric_kind before sending - chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| { - BillingMetric::new_absolute( - curr_key.metric, - curr_key.tenant_id, - curr_key.timeline_id, - *curr_val, - ) - })); + + // this code block is needed to convince compiler + // that rng is not reused aroung await point + { + // enrich metrics with timestamp and metric_kind before sending + let mut rng = rand::thread_rng(); + chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| { + BillingMetric::new_absolute( + curr_key.metric, + curr_key.tenant_id, + curr_key.timeline_id, + *curr_val, + node_id, + &mut rng, + ) + })); + } let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: &chunk_to_send, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5246541375..4b71874bdf 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -341,6 +341,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { pageserver::billing_metrics::collect_metrics( metric_collection_endpoint, conf.metric_collection_interval, + conf.id, ) .instrument(info_span!("metrics_collection")) .await?; From 0e7c03370e9df37ee9fea7a667d6fbb2885aec08 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 29 Dec 2022 12:20:28 +0200 Subject: [PATCH 159/167] Lazy calculation of traversal_id which is needed only for error repoting (#3221) See https://neondb.slack.com/archives/C0277TKAJCA/p1672245908989789 and https://neondb.slack.com/archives/C033RQ5SPDH/p1671885245981359 --- pageserver/src/tenant/timeline.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 137c38ca85..951f217cf9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1588,7 +1588,7 @@ trait TraversalLayerExt { } impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> String { + fn traversal_id(&self) -> TraversalId { match self.local_path() { Some(local_path) => { debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())), @@ -1608,7 +1608,7 @@ impl TraversalLayerExt for Arc { } impl TraversalLayerExt for Arc { - fn traversal_id(&self) -> String { + fn traversal_id(&self) -> TraversalId { format!( "timeline {} in-memory {}", self.get_timeline_id(), @@ -1638,7 +1638,8 @@ impl Timeline { // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. 
- let mut traversal_path = Vec::<(ValueReconstructResult, Lsn, TraversalId)>::new(); + let mut traversal_path = + Vec::<(ValueReconstructResult, Lsn, Box)>::new(); let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { *cached_lsn @@ -1726,7 +1727,7 @@ impl Timeline { Err(e) => return PageReconstructResult::from(e), }; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, open_layer.traversal_id())); + traversal_path.push((result, cont_lsn, Box::new(open_layer.clone()))); continue; } } @@ -1744,7 +1745,7 @@ impl Timeline { Err(e) => return PageReconstructResult::from(e), }; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); + traversal_path.push((result, cont_lsn, Box::new(frozen_layer.clone()))); continue 'outer; } } @@ -1771,7 +1772,7 @@ impl Timeline { Err(e) => return PageReconstructResult::from(e), }; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, layer.traversal_id())); + traversal_path.push((result, cont_lsn, Box::new(layer.clone()))); } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent result = ValueReconstructResult::Continue; @@ -3344,7 +3345,7 @@ where /// to an error, as anyhow context information. fn layer_traversal_error( msg: String, - path: Vec<(ValueReconstructResult, Lsn, TraversalId)>, + path: Vec<(ValueReconstructResult, Lsn, Box)>, ) -> PageReconstructResult<()> { // We want the original 'msg' to be the outermost context. The outermost context // is the most high-level information, which also gets propagated to the client. @@ -3353,7 +3354,9 @@ fn layer_traversal_error( .map(|(r, c, l)| { format!( "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, c, l, + r, + c, + l.traversal_id(), ) }) .chain(std::iter::once(msg)); From c0290467fa100c7e2c147e3b804e9a45d108b0b0 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Thu, 29 Dec 2022 12:33:30 +0200 Subject: [PATCH 160/167] Fix #2907 Remove missing_layers from IndexPart (#3217) #2907 --- .../src/tenant/remote_timeline_client/index.rs | 15 +-------------- .../regress/test_tenants_with_remote_storage.py | 10 ++++++---- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 017be29726..c199b7e10b 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -83,11 +83,6 @@ where /// Additional metadata can might exist in `layer_metadata`. pub timeline_layers: HashSet, - /// FIXME: unused field. This should be removed, but that changes the on-disk format, - /// so we need to make sure we're backwards-` (and maybe forwards-) compatible - /// First pass is to move it to Optional and the next would be its removal - missing_layers: Option>, - /// Per layer file name metadata, which can be present for a present or missing layer file. 
/// /// Older versions of `IndexPart` will not have this property or have only a part of metadata @@ -167,8 +162,6 @@ impl IndexPartUnclean { let IndexPartUnclean { version, timeline_layers, - // this is an unused field, ignore it on cleaning - missing_layers: _, layer_metadata, disk_consistent_lsn, metadata_bytes, @@ -189,7 +182,6 @@ impl IndexPartUnclean { } }) .collect(), - missing_layers: None, layer_metadata: layer_metadata .into_iter() .filter_map(|(l, m)| l.into_clean().map(|l| (l, m))) @@ -225,7 +217,6 @@ impl IndexPart { Self { version: Self::LATEST_VERSION, timeline_layers, - missing_layers: Some(HashSet::new()), layer_metadata, disk_consistent_lsn, metadata_bytes, @@ -259,7 +250,6 @@ mod tests { fn v0_indexpart_is_parsed() { let example = r#"{ "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"], "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] }"#; @@ -267,7 +257,6 @@ mod tests { let expected = IndexPart { version: 0, timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), - missing_layers: None, // disabled fields should not carry unused values further layer_metadata: HashMap::default(), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: 
[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), @@ -283,7 +272,6 @@ mod tests { let example = r#"{ "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } @@ -296,7 +284,6 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), - missing_layers: None, layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: Some(25600000), @@ -322,6 +309,7 @@ mod tests { let example = r#"{ "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["This shouldn't fail deserialization"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } @@ -346,7 +334,6 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: 
[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), - missing_layers: None, }; let part = serde_json::from_str::(example).unwrap(); diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6a5b4278da..6da6a4d446 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -229,7 +229,7 @@ def test_tenant_upgrades_index_json_from_v0( "timeline_layers":[ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9" ], - "missing_layers":[], + "missing_layers":["This should not fail as its not used anymore"], "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[] }""" @@ -261,7 +261,6 @@ def test_tenant_upgrades_index_json_from_v0( wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) - env.postgres.stop_all() env.pageserver.stop() @@ -274,7 +273,10 @@ def test_tenant_upgrades_index_json_from_v0( # keep the deserialized for later inspection orig_index_part = json.load(timeline_file) - v0_index_part = {key: orig_index_part[key] for key in v0_skeleton} + v0_index_part = { + key: orig_index_part[key] + for key in v0_skeleton.keys() - ["missing_layers"] # pgserver doesn't have it anymore + } timeline_file.seek(0) json.dump(v0_index_part, timeline_file) @@ -306,7 +308,7 @@ def test_tenant_upgrades_index_json_from_v0( # make sure the file has been upgraded back to how it started index_part = local_fs_index_part(env, tenant_id, timeline_id) assert index_part["version"] == orig_index_part["version"] - assert index_part["missing_layers"] == orig_index_part["missing_layers"] + assert "missing_layers" not in index_part.keys() # expect one more layer because of the forced checkpoint assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1 From 894ac30734663f9b17645535771a47952d84b8f5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 29 Dec 2022 12:28:58 +0200 Subject: [PATCH 161/167] Rename billing_metrics to consumption_metrics. Use more appropriate term, because not all of these metrics are used for billing. 
--- pageserver/src/bin/pageserver.rs | 2 +- ...ling_metrics.rs => consumption_metrics.rs} | 54 +++++++++---------- pageserver/src/lib.rs | 2 +- 3 files changed, 29 insertions(+), 29 deletions(-) rename pageserver/src/{billing_metrics.rs => consumption_metrics.rs} (85%) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 4b71874bdf..b3d9b0f809 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -338,7 +338,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { "consumption metrics collection", true, async move { - pageserver::billing_metrics::collect_metrics( + pageserver::consumption_metrics::collect_metrics( metric_collection_endpoint, conf.metric_collection_interval, conf.id, diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/consumption_metrics.rs similarity index 85% rename from pageserver/src/billing_metrics.rs rename to pageserver/src/consumption_metrics.rs index 3a6b83773d..0d96eb431d 100644 --- a/pageserver/src/billing_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -25,7 +25,7 @@ use chrono::{DateTime, Utc}; use rand::Rng; use reqwest::Url; -/// BillingMetric struct that defines the format for one metric entry +/// ConsumptionMetric struct that defines the format for one metric entry /// i.e. /// /// ```json @@ -41,8 +41,8 @@ use reqwest::Url; /// ``` #[serde_as] #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] -pub struct BillingMetric { - pub metric: BillingMetricKind, +pub struct ConsumptionMetric { + pub metric: ConsumptionMetricKind, #[serde(rename = "type")] pub metric_type: &'static str, #[serde_as(as = "DisplayFromStr")] @@ -55,9 +55,9 @@ pub struct BillingMetric { pub value: u64, } -impl BillingMetric { +impl ConsumptionMetric { pub fn new_absolute( - metric: BillingMetricKind, + metric: ConsumptionMetricKind, tenant_id: TenantId, timeline_id: Option, value: u64, @@ -79,7 +79,7 @@ impl BillingMetric { #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] -pub enum BillingMetricKind { +pub enum ConsumptionMetricKind { /// Amount of WAL produced , by a timeline, i.e. last_record_lsn /// This is an absolute, per-timeline metric. 
WrittenSize, @@ -96,7 +96,7 @@ pub enum BillingMetricKind { RemoteStorageSize, } -impl FromStr for BillingMetricKind { +impl FromStr for ConsumptionMetricKind { type Err = anyhow::Error; fn from_str(s: &str) -> Result { @@ -110,27 +110,27 @@ impl FromStr for BillingMetricKind { } } -impl fmt::Display for BillingMetricKind { +impl fmt::Display for ConsumptionMetricKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(match self { - BillingMetricKind::WrittenSize => "written_size", - BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size", - BillingMetricKind::ResidentSize => "resident_size", - BillingMetricKind::RemoteStorageSize => "remote_storage_size", + ConsumptionMetricKind::WrittenSize => "written_size", + ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size", + ConsumptionMetricKind::ResidentSize => "resident_size", + ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size", }) } } #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct BillingMetricsKey { +pub struct ConsumptionMetricsKey { tenant_id: TenantId, timeline_id: Option, - metric: BillingMetricKind, + metric: ConsumptionMetricKind, } #[derive(serde::Serialize)] struct EventChunk<'a> { - events: &'a [BillingMetric], + events: &'a [ConsumptionMetric], } /// Main thread that serves metrics collection @@ -145,7 +145,7 @@ pub async fn collect_metrics( // define client here to reuse it for all requests let client = reqwest::Client::new(); - let mut cached_metrics: HashMap = HashMap::new(); + let mut cached_metrics: HashMap = HashMap::new(); loop { tokio::select! { @@ -166,11 +166,11 @@ pub async fn collect_metrics( /// Cache metrics to avoid sending the same metrics multiple times. pub async fn collect_metrics_task( client: &reqwest::Client, - cached_metrics: &mut HashMap, + cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, ) -> anyhow::Result<()> { - let mut current_metrics: Vec<(BillingMetricsKey, u64)> = Vec::new(); + let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new(); trace!( "starting collect_metrics_task. 
metric_collection_endpoint: {}", metric_collection_endpoint @@ -194,10 +194,10 @@ pub async fn collect_metrics_task( let timeline_written_size = u64::from(timeline.get_last_record_lsn()); current_metrics.push(( - BillingMetricsKey { + ConsumptionMetricsKey { tenant_id, timeline_id: Some(timeline.timeline_id), - metric: BillingMetricKind::WrittenSize, + metric: ConsumptionMetricKind::WrittenSize, }, timeline_written_size, )); @@ -217,19 +217,19 @@ pub async fn collect_metrics_task( ); current_metrics.push(( - BillingMetricsKey { + ConsumptionMetricsKey { tenant_id, timeline_id: None, - metric: BillingMetricKind::ResidentSize, + metric: ConsumptionMetricKind::ResidentSize, }, tenant_resident_size, )); current_metrics.push(( - BillingMetricsKey { + ConsumptionMetricsKey { tenant_id, timeline_id: None, - metric: BillingMetricKind::RemoteStorageSize, + metric: ConsumptionMetricKind::RemoteStorageSize, }, tenant_remote_size, )); @@ -253,7 +253,7 @@ pub async fn collect_metrics_task( const CHUNK_SIZE: usize = 1000; let chunks = current_metrics.chunks(CHUNK_SIZE); - let mut chunk_to_send: Vec = Vec::with_capacity(1000); + let mut chunk_to_send: Vec = Vec::with_capacity(1000); for chunk in chunks { chunk_to_send.clear(); @@ -264,7 +264,7 @@ pub async fn collect_metrics_task( // enrich metrics with timestamp and metric_kind before sending let mut rng = rand::thread_rng(); chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| { - BillingMetric::new_absolute( + ConsumptionMetric::new_absolute( curr_key.metric, curr_key.tenant_id, curr_key.timeline_id, @@ -278,7 +278,7 @@ pub async fn collect_metrics_task( let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: &chunk_to_send, }) - .expect("BillingMetric should not fail serialization"); + .expect("ConsumptionMetric should not fail serialization"); let res = client .post(metric_collection_endpoint.clone()) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 29050a5bc2..2f78c199b9 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,7 +1,7 @@ mod auth; pub mod basebackup; -pub mod billing_metrics; pub mod config; +pub mod consumption_metrics; pub mod http; pub mod import_datadir; pub mod keyspace; From 434fcac357ef0e5826be93fb7695e6faa1c70c38 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 23 Dec 2022 10:23:28 +0100 Subject: [PATCH 162/167] Remove unused HTTP endpoints from compute_ctl --- compute_tools/src/http/api.rs | 30 +------------- compute_tools/src/http/openapi_spec.yaml | 51 ------------------------ 2 files changed, 1 insertion(+), 80 deletions(-) diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 4c8bbc608b..44f83e5003 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,29 +9,11 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode}; use log::{error, info}; use serde_json; -use crate::compute::{ComputeNode, ComputeStatus}; +use crate::compute::ComputeNode; // Service function to handle all available routes. async fn routes(req: Request, compute: Arc) -> Response { match (req.method(), req.uri().path()) { - // Timestamp of the last Postgres activity in the plain text. - // DEPRECATED in favour of /status - (&Method::GET, "/last_activity") => { - info!("serving /last_active GET request"); - let state = compute.state.read().unwrap(); - - // Use RFC3339 format for consistency. - Response::new(Body::from(state.last_active.to_rfc3339())) - } - - // Has compute setup process finished? -> true/false. 
- // DEPRECATED in favour of /status - (&Method::GET, "/ready") => { - info!("serving /ready GET request"); - let status = compute.get_status(); - Response::new(Body::from(format!("{}", status == ComputeStatus::Running))) - } - // Serialized compute state. (&Method::GET, "/status") => { info!("serving /status GET request"); @@ -46,16 +28,6 @@ async fn routes(req: Request, compute: Arc) -> Response Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) } - // DEPRECATED, use POST instead - (&Method::GET, "/check_writability") => { - info!("serving /check_writability GET request"); - let res = crate::checker::check_writability(&compute).await; - match res { - Ok(_) => Response::new(Body::from("true")), - Err(e) => Response::new(Body::from(e.to_string())), - } - } - (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); let res = crate::checker::check_writability(&compute).await; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 9c0f8e3ccd..a857531d26 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -37,58 +37,7 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" - /ready: - get: - deprecated: true - tags: - - "info" - summary: Check whether compute startup process finished successfully - description: "" - operationId: computeIsReady - responses: - "200": - description: Compute is ready ('true') or not ('false') - content: - text/plain: - schema: - type: string - example: "true" - - /last_activity: - get: - deprecated: true - tags: - - "info" - summary: Get timestamp of the last compute activity - description: "" - operationId: getLastComputeActivityTS - responses: - "200": - description: Timestamp of the last compute activity - content: - text/plain: - schema: - type: string - example: "2022-10-12T07:20:50.52Z" - /check_writability: - get: - deprecated: true - tags: - - "check" - summary: Check that we can write new data on this compute - description: "" - operationId: checkComputeWritabilityDeprecated - responses: - "200": - description: Check result - content: - text/plain: - schema: - type: string - description: Error text or 'true' if check passed - example: "true" - post: tags: - "check" From fefe19a284c851a4f74abe83f8d478263163260d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 28 Dec 2022 10:48:27 +0200 Subject: [PATCH 163/167] Avoid calling find_lsn_for_timestamp call while holding lock. Refactor update_gc_info function so that it calls the potentially expensive find_lsn_for_timestamp() function before acquiring the lock. This will also be needed if we make find_lsn_for_timestamp() async in the future; it cannot be awaited while holding the lock. --- pageserver/src/tenant/timeline.rs | 67 +++++++++++++++++++------------ 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 951f217cf9..ec4b3ae665 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2674,29 +2674,27 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. + /// + /// NOTE: This function holds a short-lived lock to protect the 'gc_info' + /// field, so that the three values passed as argument are stored + /// atomically. 
But the caller is responsible for ensuring that no new + /// branches are created that would need to be included in 'retain_lsns', + /// for example. The caller should hold `Tenant::gc_cs` lock to ensure + /// that. + /// pub(super) async fn update_gc_info( &self, retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, ) -> anyhow::Result<()> { - let mut gc_info = self.gc_info.write().unwrap(); - - gc_info.horizon_cutoff = cutoff_horizon; - gc_info.retain_lsns = retain_lsns; - - // Calculate pitr cutoff point. - // If we cannot determine a cutoff LSN, be conservative and don't GC anything. - let mut pitr_cutoff_lsn: Lsn; - - if pitr != Duration::ZERO { - // conservative, safe default is to remove nothing, when we have no - // commit timestamp data available - pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn(); - - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // + // Some unit tests depend on garbage-collection working even when + // CLOG data is missing, so that find_lsn_for_timestamp() doesn't + // work, so avoid calling it altogether if time-based retention is not + // configured. It would be pointless anyway. + let pitr_cutoff = if pitr != Duration::ZERO { let now = SystemTime::now(); if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); @@ -2705,27 +2703,44 @@ impl Timeline { .find_lsn_for_timestamp(pitr_timestamp) .no_ondemand_download()? { - LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. debug!("future({})", lsn); - pitr_cutoff_lsn = gc_info.horizon_cutoff; + cutoff_horizon } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); + // conservative, safe default is to remove nothing, when we + // have no commit timestamp data available + *self.get_latest_gc_cutoff_lsn() } LsnForTimestamp::NoData(lsn) => { debug!("nodata({})", lsn); + // conservative, safe default is to remove nothing, when we + // have no commit timestamp data available + *self.get_latest_gc_cutoff_lsn() } } - debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } else { + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention. (Some unit tests depend on garbage-collection - // working even when CLOG data is missing, so that find_lsn_for_timestamp() - // above doesn't work.) - pitr_cutoff_lsn = gc_info.horizon_cutoff; - } - gc_info.pitr_cutoff = pitr_cutoff_lsn; + // No time-based retention was configured. Set time-based cutoff to + // same as LSN based. + cutoff_horizon + }; + + // Grab the lock and update the values + *self.gc_info.write().unwrap() = GcInfo { + retain_lsns, + horizon_cutoff: cutoff_horizon, + pitr_cutoff, + }; Ok(()) } From 890ff3803e2413be9e641ce0d2be23b3ea9b5a6d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 28 Dec 2022 10:48:33 +0200 Subject: [PATCH 164/167] Allow update_gc_info to download files on-demand. 
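The previous patch moved the potentially expensive find_lsn_for_timestamp() call out from under the gc_info lock; this patch additionally lets that call download layer files on demand (and await), which is only reasonable because the lock is no longer held around it. A minimal, self-contained sketch of the resulting compute-then-store shape follows; the types and numbers are illustrative stand-ins, not the pageserver's real ones:

// Illustrative sketch of the pattern established by the update_gc_info
// refactor above: do the slow work first, then take the lock only to store
// the results as one struct assignment.
use std::sync::RwLock;

#[derive(Default)]
struct GcInfo {
    retain_lsns: Vec<u64>,
    horizon_cutoff: u64,
    pitr_cutoff: u64,
}

// Stand-in for find_lsn_for_timestamp(), which after this patch may trigger
// an on-demand layer download before returning a cutoff LSN.
fn expensive_pitr_cutoff() -> u64 {
    1_000
}

fn update_gc_info(gc_info: &RwLock<GcInfo>, retain_lsns: Vec<u64>, horizon_cutoff: u64) {
    // 1. Compute the expensive value with no lock held.
    let pitr_cutoff = expensive_pitr_cutoff();
    // 2. Hold the write lock only long enough to store all three values
    //    atomically, so readers never see a half-updated GcInfo.
    *gc_info.write().unwrap() = GcInfo {
        retain_lsns,
        horizon_cutoff,
        pitr_cutoff,
    };
}

fn main() {
    let gc_info = RwLock::new(GcInfo::default());
    update_gc_info(&gc_info, vec![10, 20], 500);
    let info = gc_info.read().unwrap();
    assert_eq!(info.retain_lsns.len(), 2);
    assert_eq!(info.horizon_cutoff, 500);
    assert_eq!(info.pitr_cutoff, 1_000);
}

The single assignment under the write lock keeps retain_lsns, horizon_cutoff and pitr_cutoff mutually consistent, matching the refactored update_gc_info above.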
--- pageserver/src/tenant/timeline.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ec4b3ae665..df02f24239 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2699,9 +2699,7 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self - .find_lsn_for_timestamp(pitr_timestamp) - .no_ondemand_download()? + match with_ondemand_download(|| self.find_lsn_for_timestamp(pitr_timestamp)).await? { LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { From 8ff7bc5df1b7644825c5379474b7715f3bb2ab21 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 27 Dec 2022 19:45:09 +0200 Subject: [PATCH 165/167] Add timleline_logical_size metric. Send this metric only when it is fully calculated. Make consumption metrics more stable: - Send per-timeline metrics only for active timelines. - Adjust test assertions to make test_metric_collection test more stable. --- pageserver/src/consumption_metrics.rs | 43 +++++++++++++------ pageserver/src/http/routes.rs | 2 +- pageserver/src/tenant/timeline.rs | 8 +++- .../src/walreceiver/walreceiver_connection.rs | 7 +-- test_runner/regress/test_metric_collection.py | 16 ++++++- 5 files changed, 55 insertions(+), 21 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 0d96eb431d..c411a9e025 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -94,6 +94,9 @@ pub enum ConsumptionMetricKind { /// Size of the remote storage (S3) directory. /// This is an absolute, per-tenant metric. RemoteStorageSize, + /// Logical size of the data in the timeline + /// This is an absolute, per-timeline metric + TimelineLogicalSize, } impl FromStr for ConsumptionMetricKind { @@ -105,6 +108,7 @@ impl FromStr for ConsumptionMetricKind { "synthetic_storage_size" => Ok(Self::SyntheticStorageSize), "resident_size" => Ok(Self::ResidentSize), "remote_storage_size" => Ok(Self::RemoteStorageSize), + "timeline_logical_size" => Ok(Self::TimelineLogicalSize), _ => anyhow::bail!("invalid value \"{s}\" for metric type"), } } @@ -117,6 +121,7 @@ impl fmt::Display for ConsumptionMetricKind { ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size", ConsumptionMetricKind::ResidentSize => "resident_size", ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size", + ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size", }) } } @@ -191,23 +196,35 @@ pub async fn collect_metrics_task( // iterate through list of timelines in tenant for timeline in tenant.list_timelines().iter() { - let timeline_written_size = u64::from(timeline.get_last_record_lsn()); + // collect per-timeline metrics only for active timelines + if timeline.is_active() { + let timeline_written_size = u64::from(timeline.get_last_record_lsn()); - current_metrics.push(( - ConsumptionMetricsKey { - tenant_id, - timeline_id: Some(timeline.timeline_id), - metric: ConsumptionMetricKind::WrittenSize, - }, - timeline_written_size, - )); + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: ConsumptionMetricKind::WrittenSize, + }, + timeline_written_size, + )); + + let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?; + // Only send timeline logical size when it is fully calculated. 
+ if is_exact { + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: ConsumptionMetricKind::TimelineLogicalSize, + }, + timeline_logical_size, + )); + } + } let timeline_resident_size = timeline.get_resident_physical_size(); tenant_resident_size += timeline_resident_size; - - debug!( - "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)", - tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size) } let tenant_remote_size = tenant.get_remote_size().await?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 66a1607801..4f4c397abe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -120,7 +120,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result Some(lsn), }; let current_logical_size = match timeline.get_current_logical_size() { - Ok(size) => Some(size), + Ok((size, _)) => Some(size), Err(err) => { error!("Timeline info creation failed to get current logical size: {err:?}"); None diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index df02f24239..2c22c6694d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -752,18 +752,22 @@ impl Timeline { /// /// The size could be lagging behind the actual number, in case /// the initial size calculation has not been run (gets triggered on the first size access). - pub fn get_current_logical_size(self: &Arc) -> anyhow::Result { + /// + /// return size and boolean flag that shows if the size is exact + pub fn get_current_logical_size(self: &Arc) -> anyhow::Result<(u64, bool)> { let current_size = self.current_logical_size.current_size()?; debug!("Current size: {current_size:?}"); + let mut is_exact = true; let size = current_size.size(); if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = (current_size, self.current_logical_size.initial_part_end) { + is_exact = false; self.try_spawn_size_init_task(init_lsn); } - Ok(size) + Ok((size, is_exact)) } /// Check if more than 'checkpoint_distance' of WAL has been accumulated in diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index a98126e683..3753807327 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -335,10 +335,11 @@ pub async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. 
+ let (timeline_logical_size, _) = timeline + .get_current_logical_size() + .context("Status update creation failed to get current logical size")?; let status_update = ReplicationFeedback { - current_timeline_size: timeline - .get_current_logical_size() - .context("Status update creation failed to get current logical size")?, + current_timeline_size: timeline_logical_size, ps_writelsn: write_lsn, ps_flushlsn: flush_lsn, ps_applylsn: apply_lsn, diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index a3b3609153..ac9f163801 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -42,16 +42,28 @@ def metrics_handler(request: Request) -> Response: # >= 0 check here is to avoid race condition when we receive metrics before # remote_uploaded is updated "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, + # logical size may lag behind the actual size, so allow 0 here + "timeline_logical_size": lambda value: value >= 0, } + events_received = 0 for event in events: - assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid" + check = checks.get(event["metric"]) + # calm down mypy + if check is not None: + assert check(event["value"]), f"{event['metric']} isn't valid" + events_received += 1 global first_request # check that all checks were sent # but only on the first request, because we don't send non-changed metrics if first_request: - assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received" + # we may receive more metrics than we check, + # because there are two timelines + # and we may receive per-timeline metrics from both + # if the test was slow enough for these metrics to be collected + # -1 because that is ok to not receive timeline_logical_size + assert events_received >= len(checks) - 1 first_request = False global num_metrics_received From 7c7d225d9805d34cfe5071798e0ae6af4cf38df5 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 29 Dec 2022 16:08:21 +0200 Subject: [PATCH 166/167] add pageserver to new region see https://github.com/neondatabase/aws/pull/116 --- .github/ansible/prod.us-west-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 7d6e49bf9c..9eb422a3ae 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -25,6 +25,8 @@ storage: ansible_host: i-0d9f6dfae0e1c780d pageserver-1.us-west-2.aws.neon.tech: ansible_host: i-0c834be1dddba8b3f + pageserver-2.us-west-2.aws.neon.tech: + ansible_host: i-051642d372c0a4f32 safekeepers: hosts: From c700c7db2e89056fa53a89d59ca42a143c46cea7 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 26 Dec 2022 22:10:28 +0300 Subject: [PATCH 167/167] [proxy] Add more labels to the pricing metrics --- proxy/src/auth/backend.rs | 45 ++----- proxy/src/auth/backend/console.rs | 44 +++---- proxy/src/auth/backend/link.rs | 2 +- proxy/src/auth/backend/postgres.rs | 11 +- proxy/src/compute.rs | 2 +- proxy/src/console.rs | 5 + proxy/src/console/messages.rs | 190 +++++++++++++++++++++++++++++ proxy/src/main.rs | 1 + proxy/src/mgmt.rs | 100 +-------------- proxy/src/proxy.rs | 18 +-- proxy/src/proxy/tests.rs | 2 +- test_runner/regress/test_proxy.py | 6 +- 12 files changed, 249 insertions(+), 177 deletions(-) create mode 100644 proxy/src/console.rs create mode 100644 proxy/src/console/messages.rs diff --git 
a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 4b937f017a..4adf0ed940 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -8,7 +8,9 @@ pub use console::{GetAuthInfoError, WakeComputeError}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, - compute, http, mgmt, stream, url, + compute, + console::messages::MetricsAuxInfo, + http, mgmt, stream, url, waiters::{self, Waiter, Waiters}, }; use once_cell::sync::Lazy; @@ -126,25 +128,13 @@ pub struct AuthSuccess { pub value: T, } -impl AuthSuccess { - /// Very similar to [`std::option::Option::map`]. - /// Maps [`AuthSuccess`] to [`AuthSuccess`] by applying - /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> AuthSuccess { - AuthSuccess { - reported_auth_ok: self.reported_auth_ok, - value: f(self.value), - } - } -} - /// Info for establishing a connection to a compute node. /// This is what we get after auth succeeded, but not before! pub struct NodeInfo { - /// Project from [`auth::ClientCredentials`]. - pub project: String, /// Compute node connection params. pub config: compute::ConnCfg, + /// Labels for proxy's metrics. + pub aux: MetricsAuxInfo, } impl BackendType<'_, ClientCredentials<'_>> { @@ -172,37 +162,34 @@ impl BackendType<'_, ClientCredentials<'_>> { }; // TODO: find a proper way to merge those very similar blocks. - let (mut config, payload) = match self { + let (mut node, payload) = match self { Console(endpoint, creds) if creds.project.is_none() => { let payload = fetch_magic_payload.await?; let mut creds = creds.as_ref(); creds.project = Some(payload.project.as_str().into()); - let config = console::Api::new(endpoint, extra, &creds) + let node = console::Api::new(endpoint, extra, &creds) .wake_compute() .await?; - (config, payload) + (node, payload) } Postgres(endpoint, creds) if creds.project.is_none() => { let payload = fetch_magic_payload.await?; let mut creds = creds.as_ref(); creds.project = Some(payload.project.as_str().into()); - let config = postgres::Api::new(endpoint, &creds).wake_compute().await?; + let node = postgres::Api::new(endpoint, &creds).wake_compute().await?; - (config, payload) + (node, payload) } _ => return Ok(None), }; - config.password(payload.password); + node.config.password(payload.password); Ok(Some(AuthSuccess { reported_auth_ok: false, - value: NodeInfo { - project: payload.project, - config, - }, + value: node, })) } @@ -233,10 +220,6 @@ impl BackendType<'_, ClientCredentials<'_>> { console::Api::new(&endpoint, extra, &creds) .handle_user(client) .await? - .map(|config| NodeInfo { - project: creds.project.unwrap().into_owned(), - config, - }) } Postgres(endpoint, creds) => { info!("performing mock authentication using a local postgres instance"); @@ -245,10 +228,6 @@ impl BackendType<'_, ClientCredentials<'_>> { postgres::Api::new(&endpoint, &creds) .handle_user(client) .await? - .map(|config| NodeInfo { - project: creds.project.unwrap().into_owned(), - config, - }) } // NOTE: this auth backend doesn't use client credentials. Link(url) => { diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 040870fc8e..b3e3fd0c10 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,16 +1,16 @@ //! Cloud API V2. 
-use super::{AuthSuccess, ConsoleReqExtra}; +use super::{AuthSuccess, ConsoleReqExtra, NodeInfo}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, + console::messages::{ConsoleError, GetRoleSecret, WakeCompute}, error::{io_error, UserFacingError}, http, sasl, scram, stream::PqStream, }; use futures::TryFutureExt; use reqwest::StatusCode as HttpStatusCode; -use serde::Deserialize; use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; @@ -136,24 +136,6 @@ impl UserFacingError for WakeComputeError { } } -/// Console's response which holds client's auth secret. -#[derive(Deserialize, Debug)] -struct GetRoleSecret { - role_secret: Box, -} - -/// Console's response which holds compute node's `host:port` pair. -#[derive(Deserialize, Debug)] -struct WakeCompute { - address: Box, -} - -/// Console's error response with human-readable description. -#[derive(Deserialize, Debug)] -struct ConsoleError { - error: Box, -} - /// Auth secret which is managed by the cloud. pub enum AuthInfo { /// Md5 hash of user's password. @@ -194,7 +176,7 @@ impl<'a> Api<'a> { pub(super) async fn handle_user( &'a self, client: &mut PqStream, - ) -> auth::Result> { + ) -> auth::Result> { handle_user(client, self, Self::get_auth_info, Self::wake_compute).await } } @@ -238,7 +220,7 @@ impl Api<'_> { } /// Wake up the compute node and return the corresponding connection info. - pub async fn wake_compute(&self) -> Result { + pub async fn wake_compute(&self) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); async { let request = self @@ -269,7 +251,10 @@ impl Api<'_> { .dbname(self.creds.dbname) .user(self.creds.user); - Ok(config) + Ok(NodeInfo { + config, + aux: body.aux, + }) } .map_err(crate::error::log_error) .instrument(info_span!("wake_compute", id = request_id)) @@ -284,11 +269,11 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( endpoint: &'a Endpoint, get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo, wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, -) -> auth::Result> +) -> auth::Result> where Endpoint: AsRef>, GetAuthInfo: Future, GetAuthInfoError>>, - WakeCompute: Future>, + WakeCompute: Future>, { let creds = endpoint.as_ref(); @@ -325,19 +310,20 @@ where } }; - let mut config = wake_compute(endpoint).await?; + let mut node = wake_compute(endpoint).await?; if let Some(keys) = scram_keys { - config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys)); + use tokio_postgres::config::AuthKeys; + node.config.auth_keys(AuthKeys::ScramSha256(keys)); } Ok(AuthSuccess { reported_auth_ok: false, - value: config, + value: node, }) } /// Parse http response body, taking status code into account. 
-async fn parse_body Deserialize<'a>>( +async fn parse_body serde::Deserialize<'a>>( response: reqwest::Response, ) -> Result { let status = response.status(); diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 641519ac50..e16bbc70e4 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -86,8 +86,8 @@ pub async fn handle_user( Ok(AuthSuccess { reported_auth_ok: true, value: NodeInfo { - project: db_info.project, config, + aux: db_info.aux, }, }) } diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index 8f16dc9fa8..260342f103 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -2,7 +2,7 @@ use super::{ console::{self, AuthInfo, GetAuthInfoError, WakeComputeError}, - AuthSuccess, + AuthSuccess, NodeInfo, }; use crate::{ auth::{self, ClientCredentials}, @@ -57,7 +57,7 @@ impl<'a> Api<'a> { pub(super) async fn handle_user( &'a self, client: &mut PqStream, - ) -> auth::Result> { + ) -> auth::Result> { // We reuse user handling logic from a production module. console::handle_user(client, self, Self::get_auth_info, Self::wake_compute).await } @@ -103,7 +103,7 @@ impl Api<'_> { } /// We don't need to wake anything locally, so we just return the connection info. - pub async fn wake_compute(&self) -> Result { + pub async fn wake_compute(&self) -> Result { let mut config = compute::ConnCfg::new(); config .host(self.endpoint.host_str().unwrap_or("localhost")) @@ -111,7 +111,10 @@ impl Api<'_> { .dbname(self.creds.dbname) .user(self.creds.user); - Ok(config) + Ok(NodeInfo { + config, + aux: Default::default(), + }) } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 71421a4a65..094db73061 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -43,7 +43,7 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. #[repr(transparent)] -pub struct ConnCfg(pub tokio_postgres::Config); +pub struct ConnCfg(Box); impl ConnCfg { /// Construct a new connection config. diff --git a/proxy/src/console.rs b/proxy/src/console.rs new file mode 100644 index 0000000000..78f09ac9e1 --- /dev/null +++ b/proxy/src/console.rs @@ -0,0 +1,5 @@ +///! Various stuff for dealing with the Neon Console. +///! Later we might move some API wrappers here. + +/// Payloads used in the console's APIs. +pub mod messages; diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs new file mode 100644 index 0000000000..63a97069b8 --- /dev/null +++ b/proxy/src/console/messages.rs @@ -0,0 +1,190 @@ +use serde::Deserialize; +use std::fmt; + +/// Generic error response with human-readable description. +/// Note that we can't always present it to user as is. +#[derive(Debug, Deserialize)] +pub struct ConsoleError { + pub error: Box, +} + +/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. +/// Returned by the `/proxy_get_role_secret` API method. +#[derive(Deserialize)] +pub struct GetRoleSecret { + pub role_secret: Box, +} + +// Manually implement debug to omit sensitive info. +impl fmt::Debug for GetRoleSecret { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("GetRoleSecret").finish_non_exhaustive() + } +} + +/// Response which holds compute node's `host:port` pair. +/// Returned by the `/proxy_wake_compute` API method. 
+#[derive(Debug, Deserialize)] +pub struct WakeCompute { + pub address: Box, + pub aux: MetricsAuxInfo, +} + +/// Async response which concludes the link auth flow. +/// Also known as `kickResponse` in the console. +#[derive(Debug, Deserialize)] +pub struct KickSession<'a> { + /// Session ID is assigned by the proxy. + pub session_id: &'a str, + + /// Compute node connection params. + #[serde(deserialize_with = "KickSession::parse_db_info")] + pub result: DatabaseInfo, +} + +impl KickSession<'_> { + fn parse_db_info<'de, D>(des: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(Deserialize)] + enum Wrapper { + // Currently, console only reports `Success`. + // `Failure(String)` used to be here... RIP. + Success(DatabaseInfo), + } + + Wrapper::deserialize(des).map(|x| match x { + Wrapper::Success(info) => info, + }) + } +} + +/// Compute node connection params. +#[derive(Deserialize)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + /// Console always provides a password, but it might + /// be inconvenient for debug with local PG instance. + pub password: Option, + pub aux: MetricsAuxInfo, +} + +// Manually implement debug to omit sensitive info. +impl fmt::Debug for DatabaseInfo { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .field("dbname", &self.dbname) + .field("user", &self.user) + .finish_non_exhaustive() + } +} + +/// Various labels for prometheus metrics. +/// Also known as `ProxyMetricsAuxInfo` in the console. +#[derive(Debug, Deserialize, Default)] +pub struct MetricsAuxInfo { + pub endpoint_id: Box, + pub project_id: Box, + pub branch_id: Box, +} + +impl MetricsAuxInfo { + /// Definitions of labels for traffic metric. + pub const TRAFFIC_LABELS: &'static [&'static str] = &[ + // Received (rx) / sent (tx). + "direction", + // ID of a project. + "project_id", + // ID of an endpoint within a project. + "endpoint_id", + // ID of a branch within a project (snapshot). + "branch_id", + ]; + + /// Values of labels for traffic metric. + // TODO: add more type safety (validate arity & positions). + pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] { + [ + direction, + &self.project_id, + &self.endpoint_id, + &self.branch_id, + ] + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn dummy_aux() -> serde_json::Value { + json!({ + "endpoint_id": "endpoint", + "project_id": "project", + "branch_id": "branch", + }) + } + + #[test] + fn parse_kick_session() -> anyhow::Result<()> { + // This is what the console's kickResponse looks like. 
+ let json = json!({ + "session_id": "deadbeef", + "result": { + "Success": { + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + "aux": dummy_aux(), + } + } + }); + let _: KickSession = serde_json::from_str(&json.to_string())?; + + Ok(()) + } + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + // with password + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + "aux": dummy_aux(), + }))?; + + // without password + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "aux": dummy_aux(), + }))?; + + // new field (forward compatibility) + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "project": "hello_world", + "N.E.W": "forward compatibility check", + "aux": dummy_aux(), + }))?; + + Ok(()) + } +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 2855d1f900..89ea9142a9 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -8,6 +8,7 @@ mod auth; mod cancellation; mod compute; mod config; +mod console; mod error; mod http; mod mgmt; diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 23e10b5a9b..2e0a502e7f 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,7 +1,9 @@ -use crate::auth; +use crate::{ + auth, + console::messages::{DatabaseInfo, KickSession}, +}; use anyhow::Context; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use serde::Deserialize; use std::{ net::{TcpListener, TcpStream}, thread, @@ -50,59 +52,9 @@ fn handle_connection(socket: TcpStream) -> anyhow::Result<()> { pgbackend.run(&mut MgmtHandler) } -/// Known as `kickResponse` in the console. -#[derive(Debug, Deserialize)] -struct PsqlSessionResponse { - session_id: String, - result: PsqlSessionResult, -} - -#[derive(Debug, Deserialize)] -enum PsqlSessionResult { - Success(DatabaseInfo), - Failure(String), -} - /// A message received by `mgmt` when a compute node is ready. pub type ComputeReady = Result; -impl PsqlSessionResult { - fn into_compute_ready(self) -> ComputeReady { - match self { - Self::Success(db_info) => Ok(db_info), - Self::Failure(message) => Err(message), - } - } -} - -/// Compute node connection params provided by the console. -/// This struct and its parents are mgmt API implementation -/// detail and thus should remain in this module. -// TODO: restore deserialization tests from git history. -#[derive(Deserialize)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - /// Console always provides a password, but it might - /// be inconvenient for debug with local PG instance. - pub password: Option, - pub project: String, -} - -// Manually implement debug to omit sensitive info. -impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .field("dbname", &self.dbname) - .field("user", &self.user) - .finish_non_exhaustive() - } -} - // TODO: replace with an http-based protocol. 
struct MgmtHandler; impl postgres_backend::Handler for MgmtHandler { @@ -115,13 +67,13 @@ impl postgres_backend::Handler for MgmtHandler { } fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> { - let resp: PsqlSessionResponse = serde_json::from_str(query)?; + let resp: KickSession = serde_json::from_str(query)?; let span = info_span!("event", session_id = resp.session_id); let _enter = span.enter(); info!("got response: {:?}", resp.result); - match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) { + match auth::backend::notify(resp.session_id, Ok(resp.result)) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? @@ -135,43 +87,3 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<( Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - // with password - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - "project": "hello_world", - }))?; - - // without password - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "project": "hello_world", - }))?; - - // new field (forward compatibility) - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "project": "hello_world", - "N.E.W": "forward compatibility check", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 713388c625..382f7cd918 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context}; use futures::TryFutureExt; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; -use pq_proto::{BeMessage as Be, *}; +use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, info_span, Instrument}; @@ -39,12 +39,7 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "proxy_io_bytes_per_client", "Number of bytes sent/received between client and backend.", - &[ - // Received (rx) / sent (tx). - "direction", - // Proxy can keep calling it `project` internally. - "endpoint_id" - ] + crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, ) .unwrap() }); @@ -271,19 +266,16 @@ impl Client<'_, S> { stream .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? - .write_message(&BeMessage::ReadyForQuery) + .write_message(&Be::ReadyForQuery) .await?; - // TODO: add more identifiers. - let metric_id = node.project; - - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]); + let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("tx")); let mut client = MeasuredStream::new(stream.into_inner(), |cnt| { // Number of bytes we sent to the client (outbound). m_sent.inc_by(cnt as u64); }); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]); + let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("rx")); let mut db = MeasuredStream::new(db.stream, |cnt| { // Number of bytes the client sent to the compute node (inbound). 
m_recv.inc_by(cnt as u64); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 2f023844d0..ed429df421 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -140,7 +140,7 @@ async fn dummy_proxy( stream .write_message_noflush(&Be::AuthenticationOk)? .write_message_noflush(&Be::CLIENT_ENCODING)? - .write_message(&BeMessage::ReadyForQuery) + .write_message(&Be::ReadyForQuery) .await?; Ok(()) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index bcea4d970c..e13ba51f4b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -63,7 +63,11 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx "port": local_vanilla_pg.default_options["port"], "dbname": local_vanilla_pg.default_options["dbname"], "user": pg_user, - "project": "irrelevant", + "aux": { + "project_id": "project", + "endpoint_id": "endpoint", + "branch_id": "branch", + }, } }, }