From 3657a3c76e8b794997caf01baa80044626c82049 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Thu, 30 Nov 2023 17:33:54 +0100
Subject: [PATCH 001/202] Proxy fix metrics record (#5996)

## Problem

Some latency metrics are recorded in an inconsistent way.

## Summary of changes

Make sure that everything is recorded in seconds.
---
 proxy/src/http.rs  | 2 +-
 proxy/src/proxy.rs | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 638705d3e9..09423eca77 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -95,7 +95,7 @@ impl Endpoint {
         let res = self.client.execute(request).await;
         CONSOLE_REQUEST_LATENCY
             .with_label_values(&[&path])
-            .observe(start.elapsed().as_micros() as f64);
+            .observe(start.elapsed().as_secs_f64());
         res
     }
 }
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 2af2dd5562..c4bea13f7f 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -117,7 +117,7 @@ pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
         // proxy_wake_compute/proxy_get_role_info
         &["request"],
         // largest bucket = 2^16 * 0.2ms = 13s
-        exponential_buckets(0.2, 2.0, 16).unwrap(),
+        exponential_buckets(0.0002, 2.0, 16).unwrap(),
     )
     .unwrap()
 });
@@ -136,7 +136,7 @@ pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "semaphore_control_plane_token_acquire_seconds",
         "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.00005s = 3.28s
+        // largest bucket = 3^15 * 0.00005s = 717s
         exponential_buckets(0.00005, 3.0, 16).unwrap(),
     )
     .unwrap()

From b451e75dc619bca87aa2e3b4a585aedd1526c6b5 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 30 Nov 2023 19:31:03 +0200
Subject: [PATCH 002/202] test: include cmdline in captured output (#5977)

The aim is to make a bunch of `.stdout` and `.stderr` files faster to
understand; see the example echo_1.stdout difference:

```
+# echo foobar abbacd
+
 foobar abbacd
```

it can be disabled and is disabled in this PR for some tests; use
`pg_bin.run_capture(..., with_command_header=False)` for that.

as a bonus this cleans up the echoed newlines from s3_scrubber
output which are also saved to file but echoed to test log.

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py    | 11 +++++--
 test_runner/fixtures/utils.py            | 17 +++++++++--
 test_runner/regress/test_fullbackup.py   | 11 ++++---
 test_runner/regress/test_import.py       | 39 ++++++++++++++++--------
 test_runner/regress/test_wal_acceptor.py |  7 ++---
 5 files changed, 60 insertions(+), 25 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 862aab84dc..188851c39f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1858,7 +1858,8 @@ class PgBin:
         command: List[str],
         env: Optional[Env] = None,
         cwd: Optional[str] = None,
-        **kwargs: Any,
+        with_command_header=True,
+        **popen_kwargs: Any,
     ) -> str:
         """
         Run one of the postgres binaries, with stderr and stdout redirected to a file.
@@ -1871,7 +1872,13 @@ class PgBin:
         log.info(f"Running command '{' '.join(command)}'")
         env = self._build_env(env)
         base_path, _, _ = subprocess_capture(
-            self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs
+            self.log_dir,
+            command,
+            env=env,
+            cwd=cwd,
+            check=True,
+            with_command_header=with_command_header,
+            **popen_kwargs,
         )
         return base_path
 
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 1ec18b9f74..cda788b2a4 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -49,7 +49,8 @@ def subprocess_capture(
     echo_stdout=False,
     capture_stdout=False,
     timeout=None,
-    **kwargs: Any,
+    with_command_header=True,
+    **popen_kwargs: Any,
 ) -> Tuple[str, Optional[str], int]:
     """Run a process and bifurcate its output to files and the `log` logger
 
@@ -86,13 +87,23 @@ def subprocess_capture(
             self.captured = ""
 
         def run(self):
+            first = with_command_header
             for line in self.in_file:
+                if first:
+                    # do this only after receiving any input so that we can
+                    # keep deleting empty files, or leave it out completely if
+                    # it was unwanted (using the file as input later for example)
+                    first = False
+                    # prefix the files with the command line so that we can
+                    # later understand which file is for what command
+                    self.out_file.write((f"# {' '.join(cmd)}\n\n").encode("utf-8"))
+
                 # Only bother decoding if we are going to do something more than stream to a file
                 if self.echo or self.capture:
                     string = line.decode(encoding="utf-8", errors="replace")
 
                     if self.echo:
-                        log.info(string)
+                        log.info(string.strip())
 
                     if self.capture:
                         self.captured += string
@@ -107,7 +118,7 @@ def subprocess_capture(
 
                 p = subprocess.Popen(
                     cmd,
-                    **kwargs,
+                    **popen_kwargs,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                 )
diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py
index 214f1f33a8..a456c06862 100644
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -20,6 +20,7 @@ def test_fullbackup(
     pg_bin: PgBin,
     port_distributor: PortDistributor,
     pg_distrib_dir: Path,
+    test_output_dir: Path,
 ):
     env = neon_env_builder.init_start()
 
@@ -49,10 +50,12 @@ def test_fullbackup(
     restored_dir_path = env.repo_dir / "restored_datadir"
     os.mkdir(restored_dir_path, 0o750)
     query = f"fullbackup {env.initial_tenant} {timeline} {lsn}"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
-    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
-    tar_output_file = result_basepath + ".stdout"
-    subprocess_capture(env.repo_dir, ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)])
+    tar_output_file = test_output_dir / "fullbackup.tar"
+    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
+    pg_bin.run_capture(cmd, env=psql_env)
+    subprocess_capture(
+        env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)]
+    )
 
     # HACK
     # fullbackup returns neon specific pg_control and first WAL segment
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index d357bd0ee4..8da5f1eec2 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -163,7 +163,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     assert endpoint.safe_psql("select count(*) from t") == [(300000,)]
 
 
-def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
+def test_import_from_pageserver_small(
+    pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path
+):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
@@ -177,7 +179,7 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu
 
     num_rows = 3000
     lsn = _generate_data(num_rows, endpoint)
-    _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir)
+    _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir)
 
 
 @pytest.mark.timeout(1800)
@@ -185,7 +187,9 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu
 # the test back after finding the failure cause.
 # @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build")
 @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255")
-def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
+def test_import_from_pageserver_multisegment(
+    pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path
+):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
@@ -205,7 +209,9 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne
     log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
     assert logical_size > 1024**3  # = 1GB
 
-    tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir)
+    tar_output_file = _import(
+        num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir
+    )
 
     # Check if the backup data contains multiple segment files
     cnt_seg_files = 0
@@ -246,7 +252,8 @@ def _import(
     pg_bin: PgBin,
     timeline: TimelineId,
     pg_distrib_dir: Path,
-) -> str:
+    test_output_dir: Path,
+) -> Path:
     """Test importing backup data to the pageserver.
 
     Args:
@@ -263,9 +270,9 @@ def _import(
 
     # Get a fullbackup from pageserver
     query = f"fullbackup { env.initial_tenant} {timeline} {lsn}"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
-    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
-    tar_output_file = result_basepath + ".stdout"
+    tar_output_file = test_output_dir / "fullbackup.tar"
+    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
+    pg_bin.run_capture(cmd, env=psql_env)
 
     # Stop the first pageserver instance, erase all its data
     env.endpoints.stop_all()
@@ -299,7 +306,7 @@ def _import(
             "--base-lsn",
             str(lsn),
             "--base-tarfile",
-            os.path.join(tar_output_file),
+            str(tar_output_file),
             "--pg-version",
             env.pg_version,
         ]
@@ -315,9 +322,17 @@ def _import(
 
     # Take another fullbackup
     query = f"fullbackup { tenant} {timeline} {lsn}"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
-    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
-    new_tar_output_file = result_basepath + ".stdout"
+    new_tar_output_file = test_output_dir / "fullbackup-new.tar"
+    cmd = [
+        "psql",
+        "--no-psqlrc",
+        env.pageserver.connstr(),
+        "-c",
+        query,
+        "-o",
+        str(new_tar_output_file),
+    ]
+    pg_bin.run_capture(cmd, env=psql_env)
 
     # Check it's the same as the first fullbackup
     # TODO pageserver should be checking checksum
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 05c60eb102..b7eaaf39bc 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1,6 +1,5 @@
 import filecmp
 import os
-import pathlib
 import random
 import shutil
 import signal
@@ -639,7 +638,7 @@ class ProposerPostgres(PgProtocol):
     def __init__(
         self,
         pgdata_dir: str,
-        pg_bin,
+        pg_bin: PgBin,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         listen_addr: str,
@@ -665,7 +664,7 @@ class ProposerPostgres(PgProtocol):
     def create_dir_config(self, safekeepers: str):
         """Create dir and config for running --sync-safekeepers"""
 
-        pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True)
+        Path(self.pg_data_dir_path()).mkdir(exist_ok=True)
         with open(self.config_file_path(), "w") as f:
             cfg = [
                 "synchronous_standby_names = 'walproposer'\n",
@@ -691,7 +690,7 @@ class ProposerPostgres(PgProtocol):
             "PGDATA": self.pg_data_dir_path(),
         }
 
-        basepath = self.pg_bin.run_capture(command, env)
+        basepath = self.pg_bin.run_capture(command, env, with_command_header=False)
 
         log.info(f"postgres --sync-safekeepers output: {basepath}")
 

From f39fca0049c36cff5ed2c4b890b08f86fa56c15b Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 30 Nov 2023 20:52:30 +0000
Subject: [PATCH 003/202] proxy: chore: replace strings with SmolStr (#5786)

## Problem

no problem

## Summary of changes

replaces boxstr with arcstr as it's cheaper to clone. mild perf
improvement.

probably should look into other smallstring optimisations tbh, they will
likely be even better. The longest endpoint name I was able to construct
is something like `ep-weathered-wildflower-12345678` which is 32 bytes.
Most string optimisations top out at 23 bytes
---
 Cargo.lock                            | 10 ++++++++++
 Cargo.toml                            |  1 +
 proxy/Cargo.toml                      |  1 +
 proxy/src/auth/backend/link.rs        |  2 +-
 proxy/src/bin/pg_sni_router.rs        |  2 +-
 proxy/src/console/messages.rs         |  9 +++++----
 proxy/src/console/provider.rs         |  2 +-
 proxy/src/console/provider/neon.rs    |  2 +-
 proxy/src/proxy.rs                    |  8 ++++----
 proxy/src/serverless/conn_pool.rs     | 25 +++++++++++++------------
 proxy/src/serverless/sql_over_http.rs | 10 +++++-----
 proxy/src/usage_metrics.rs            |  9 +++++----
 12 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6546590f6c..5639665758 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3643,6 +3643,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
+ "smol_str",
  "socket2 0.5.3",
  "sync_wrapper",
  "task-local-extensions",
@@ -4709,6 +4710,15 @@ version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
 
+[[package]]
+name = "smol_str"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "socket2"
 version = "0.4.9"
diff --git a/Cargo.toml b/Cargo.toml
index cbcb25359d..ba8b49c0e0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -132,6 +132,7 @@ serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
+smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 0822718bae..48c8604d86 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -69,6 +69,7 @@ webpki-roots.workspace = true
 x509-parser.workspace = true
 native-tls.workspace = true
 postgres-native-tls.workspace = true
+smol_str.workspace = true
 
 workspace_hack.workspace = true
 tokio-util.workspace = true
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index da43cf11c4..3a77d7e5ca 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -106,7 +106,7 @@ pub(super) async fn authenticate(
         reported_auth_ok: true,
         value: NodeInfo {
             config,
-            aux: db_info.aux.into(),
+            aux: db_info.aux,
             allow_self_signed_compute: false, // caller may override
         },
     })
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 2b859fc2db..bedbdbcc83 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -284,5 +284,5 @@ async fn handle_client(
     let client = tokio::net::TcpStream::connect(destination).await?;
 
     let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await
+    proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
 }
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index e5f1615b14..837379b21f 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,4 +1,5 @@
 use serde::Deserialize;
+use smol_str::SmolStr;
 use std::fmt;
 
 /// Generic error response with human-readable description.
@@ -88,11 +89,11 @@ impl fmt::Debug for DatabaseInfo {
 
 /// Various labels for prometheus metrics.
 /// Also known as `ProxyMetricsAuxInfo` in the console.
-#[derive(Debug, Deserialize, Default)]
+#[derive(Debug, Deserialize, Clone, Default)]
 pub struct MetricsAuxInfo {
-    pub endpoint_id: Box<str>,
-    pub project_id: Box<str>,
-    pub branch_id: Box<str>,
+    pub endpoint_id: SmolStr,
+    pub project_id: SmolStr,
+    pub branch_id: SmolStr,
 }
 
 impl MetricsAuxInfo {
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index a525de8e53..e735b9f66c 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -229,7 +229,7 @@ pub struct NodeInfo {
     pub config: compute::ConnCfg,
 
     /// Labels for proxy's metrics.
-    pub aux: Arc<MetricsAuxInfo>,
+    pub aux: MetricsAuxInfo,
 
     /// Whether we should accept self-signed certificates (for testing)
     pub allow_self_signed_compute: bool,
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 117d0ec190..7828a7d7e4 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -144,7 +144,7 @@ impl Api {
 
             let node = NodeInfo {
                 config,
-                aux: body.aux.into(),
+                aux: body.aux,
                 allow_self_signed_compute: false,
             };
 
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index c4bea13f7f..36d01f9acc 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -877,11 +877,11 @@ async fn prepare_client_connection(
 pub async fn proxy_pass(
     client: impl AsyncRead + AsyncWrite + Unpin,
     compute: impl AsyncRead + AsyncWrite + Unpin,
-    aux: &MetricsAuxInfo,
+    aux: MetricsAuxInfo,
 ) -> anyhow::Result<()> {
     let usage = USAGE_METRICS.register(Ids {
-        endpoint_id: aux.endpoint_id.to_string(),
-        branch_id: aux.branch_id.to_string(),
+        endpoint_id: aux.endpoint_id.clone(),
+        branch_id: aux.branch_id.clone(),
     });
 
     let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
@@ -1032,7 +1032,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
         // immediately after opening the connection.
         let (stream, read_buf) = stream.into_inner();
         node.stream.write_all(&read_buf).await?;
-        proxy_pass(stream, node.stream, &aux).await
+        proxy_pass(stream, node.stream, aux).await
     }
 }
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 2072cadc3a..ca7a9ad0a0 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -8,6 +8,7 @@ use pbkdf2::{
     Params, Pbkdf2,
 };
 use pq_proto::StartupMessageParams;
+use smol_str::SmolStr;
 use std::{collections::HashMap, net::SocketAddr, sync::Arc};
 use std::{
     fmt,
@@ -41,16 +42,16 @@ const MAX_CONNS_PER_ENDPOINT: usize = 20;
 
 #[derive(Debug, Clone)]
 pub struct ConnInfo {
-    pub username: String,
-    pub dbname: String,
-    pub hostname: String,
-    pub password: String,
-    pub options: Option<String>,
+    pub username: SmolStr,
+    pub dbname: SmolStr,
+    pub hostname: SmolStr,
+    pub password: SmolStr,
+    pub options: Option<SmolStr>,
 }
 
 impl ConnInfo {
     // hm, change to hasher to avoid cloning?
-    pub fn db_and_user(&self) -> (String, String) {
+    pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
         (self.dbname.clone(), self.username.clone())
     }
 }
@@ -70,7 +71,7 @@ struct ConnPoolEntry {
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(String, String), DbUserConnPool>,
+    pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
     total_conns: usize,
 }
 
@@ -95,7 +96,7 @@ pub struct GlobalConnPool {
     //
     // That should be a fairly conteded map, so return reference to the per-endpoint
     // pool as early as possible and release the lock.
-    global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,
+    global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
 
     /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
     /// That seems like far too much effort, so we're using a relaxed increment counter instead.
@@ -327,7 +328,7 @@ impl GlobalConnPool {
         Ok(())
     }
 
-    fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
         // fast path
         if let Some(pool) = self.global_pool.get(endpoint) {
             return pool.clone();
@@ -468,7 +469,7 @@ async fn connect_to_compute_once(
 
     let (client, mut connection) = config
         .user(&conn_info.username)
-        .password(&conn_info.password)
+        .password(&*conn_info.password)
         .dbname(&conn_info.dbname)
         .connect_timeout(timeout)
         .connect(tokio_postgres::NoTls)
@@ -482,8 +483,8 @@ async fn connect_to_compute_once(
         info!(%conn_info, %session, "new connection");
     });
     let ids = Ids {
-        endpoint_id: node_info.aux.endpoint_id.to_string(),
-        branch_id: node_info.aux.branch_id.to_string(),
+        endpoint_id: node_info.aux.endpoint_id.clone(),
+        branch_id: node_info.aux.branch_id.clone(),
     };
 
     tokio::spawn(
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 25b96668de..6c337a837c 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -182,16 +182,16 @@ fn get_conn_info(
 
     for (key, value) in pairs {
         if key == "options" {
-            options = Some(value.to_string());
+            options = Some(value.into());
             break;
         }
     }
 
     Ok(ConnInfo {
-        username: username.to_owned(),
-        dbname: dbname.to_owned(),
-        hostname: hostname.to_owned(),
-        password: password.to_owned(),
+        username: username.into(),
+        dbname: dbname.into(),
+        hostname: hostname.into(),
+        password: password.into(),
         options,
     })
 }
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index 180b5f7199..789a4c680c 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -6,6 +6,7 @@ use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_S
 use dashmap::{mapref::entry::Entry, DashMap};
 use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
+use smol_str::SmolStr;
 use std::{
     convert::Infallible,
     sync::{
@@ -29,8 +30,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
-    pub endpoint_id: String,
-    pub branch_id: String,
+    pub endpoint_id: SmolStr,
+    pub branch_id: SmolStr,
 }
 
 #[derive(Debug)]
@@ -290,8 +291,8 @@ mod tests {
 
         // register a new counter
         let counter = metrics.register(Ids {
-            endpoint_id: "e1".to_string(),
-            branch_id: "b1".to_string(),
+            endpoint_id: "e1".into(),
+            branch_id: "b1".into(),
         });
 
         // the counter should be observed despite 0 egress

From 384277354607a98312f61c5e23e053ac8cabbfd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 30 Nov 2023 22:01:25 +0100
Subject: [PATCH 004/202] Correct RFC number for Pageserver WAL DR RFC (#5997)

When I opened #5248, 27 was an unused RFC number. Since then, two RFCs
have been merged, so now 27 is taken. 29 is free though, so move it
there.
---
 ...saster-recovery.md => 029-pageserver-wal-disaster-recovery.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename docs/rfcs/{027-pageserver-wal-disaster-recovery.md => 029-pageserver-wal-disaster-recovery.md} (100%)

diff --git a/docs/rfcs/027-pageserver-wal-disaster-recovery.md b/docs/rfcs/029-pageserver-wal-disaster-recovery.md
similarity index 100%
rename from docs/rfcs/027-pageserver-wal-disaster-recovery.md
rename to docs/rfcs/029-pageserver-wal-disaster-recovery.md

From b71b8ecfc2f557773cc6d1221ba5255a03a25e8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 30 Nov 2023 22:32:04 +0100
Subject: [PATCH 005/202] Add existing_initdb_timeline_id param to timeline
 creation (#5912)

This PR adds an `existing_initdb_timeline_id` option to timeline
creation APIs, taking an optional timeline ID.

Follow-up to #5390.

If the `existing_initdb_timeline_id` option is specified via the HTTP
API, the pageserver downloads the existing initdb archive from the given
timeline ID and extracts it, instead of running initdb itself.

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 control_plane/src/bin/neon_local.rs           |  3 +
 control_plane/src/pageserver.rs               |  2 +
 libs/pageserver_api/src/models.rs             |  2 +
 pageserver/src/http/openapi_spec.yml          |  3 +
 pageserver/src/http/routes.rs                 |  1 +
 pageserver/src/import_datadir.rs              | 13 ++-
 pageserver/src/tenant.rs                      | 79 +++++++++++++------
 .../src/tenant/remote_timeline_client.rs      | 13 ++-
 .../tenant/remote_timeline_client/download.rs | 75 ++++++++++++++++--
 pageserver/src/walingest.rs                   |  2 +-
 test_runner/fixtures/pageserver/http.py       |  4 +
 test_runner/fixtures/pageserver/utils.py      | 25 ++++--
 test_runner/regress/test_remote_storage.py    |  7 +-
 test_runner/regress/test_tenant_delete.py     |  2 +
 test_runner/regress/test_tenants.py           |  9 ++-
 test_runner/regress/test_timeline_delete.py   |  8 +-
 test_runner/regress/test_wal_restore.py       | 41 ++++++++++
 17 files changed, 245 insertions(+), 44 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index f7442c02c7..8d53a6a658 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -415,6 +415,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                 None,
                 None,
                 Some(pg_version),
+                None,
             )?;
             let new_timeline_id = timeline_info.timeline_id;
             let last_record_lsn = timeline_info.last_record_lsn;
@@ -495,6 +496,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 None,
                 None,
                 Some(pg_version),
+                None,
             )?;
             let new_timeline_id = timeline_info.timeline_id;
 
@@ -582,6 +584,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 start_lsn,
                 Some(ancestor_timeline_id),
                 None,
+                None,
             )?;
             let new_timeline_id = timeline_info.timeline_id;
 
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 0a55c90e8f..96a41874fd 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -565,6 +565,7 @@ impl PageServerNode {
         ancestor_start_lsn: Option<Lsn>,
         ancestor_timeline_id: Option<TimelineId>,
         pg_version: Option<u32>,
+        existing_initdb_timeline_id: Option<TimelineId>,
     ) -> anyhow::Result<TimelineInfo> {
         // If timeline ID was not specified, generate one
         let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
@@ -578,6 +579,7 @@ impl PageServerNode {
             ancestor_start_lsn,
             ancestor_timeline_id,
             pg_version,
+            existing_initdb_timeline_id,
         })
         .send()?
         .error_from_body()?
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 60c508037e..e6236f91e3 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -179,6 +179,8 @@ pub struct TimelineCreateRequest {
     #[serde(default)]
     pub ancestor_timeline_id: Option<TimelineId>,
     #[serde(default)]
+    pub existing_initdb_timeline_id: Option<TimelineId>,
+    #[serde(default)]
     pub ancestor_start_lsn: Option<Lsn>,
     pub pg_version: Option<u32>,
 }
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 2e418f4d8f..237109abfe 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1028,6 +1028,9 @@ paths:
                   format: hex
                 pg_version:
                   type: integer
+                existing_initdb_timeline_id:
+                  type: string
+                  format: hex
       responses:
         "201":
           description: TimelineInfo
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 37159be95c..7ab56ccab2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -441,6 +441,7 @@ async fn timeline_create_handler(
             request_data.ancestor_timeline_id.map(TimelineId::from),
             request_data.ancestor_start_lsn,
             request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            request_data.existing_initdb_timeline_id,
             state.broker_client.clone(),
             &ctx,
         )
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 770458e02e..452cd73f76 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -7,12 +7,13 @@ use std::pin::Pin;
 use std::task::{self, Poll};
 
 use anyhow::{bail, ensure, Context, Result};
+use async_compression::tokio::bufread::ZstdDecoder;
 use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
 use nix::NixPath;
-use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
+use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
 use tokio_tar::Archive;
 use tokio_tar::Builder;
 use tokio_tar::HeaderMode;
@@ -732,3 +733,13 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
     }
     Ok(compressed.buf)
 }
+
+pub async fn extract_tar_zst(
+    pgdata_path: &Utf8Path,
+    tar_zst: impl AsyncBufRead + Unpin,
+) -> Result<()> {
+    let tar = Box::pin(ZstdDecoder::new(tar_zst));
+    let mut archive = Archive::new(tar);
+    archive.unpack(pgdata_path).await?;
+    Ok(())
+}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 08066a612d..bf063664b4 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -24,6 +24,7 @@ use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
+use tokio::io::BufReader;
 use tokio::runtime::Handle;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
@@ -1558,12 +1559,14 @@ impl Tenant {
     ///
     /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
     /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
+    #[allow(clippy::too_many_arguments)]
     pub async fn create_timeline(
         &self,
         new_timeline_id: TimelineId,
         ancestor_timeline_id: Option<TimelineId>,
         mut ancestor_start_lsn: Option<Lsn>,
         pg_version: u32,
+        load_existing_initdb: Option<TimelineId>,
         broker_client: storage_broker::BrokerClientChannel,
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
@@ -1638,7 +1641,7 @@ impl Tenant {
                     .await?
             }
             None => {
-                self.bootstrap_timeline(new_timeline_id, pg_version, ctx)
+                self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx)
                     .await?
             }
         };
@@ -2951,6 +2954,7 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         pg_version: u32,
+        load_existing_initdb: Option<TimelineId>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Timeline>> {
         let timeline_uninit_mark = {
@@ -2973,8 +2977,6 @@ impl Tenant {
                 format!("Failed to remove already existing initdb directory: {pgdata_path}")
             })?;
         }
-        // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
-        run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
         // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
         scopeguard::defer! {
             if let Err(e) = fs::remove_dir_all(&pgdata_path) {
@@ -2982,31 +2984,58 @@ impl Tenant {
                 error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
             }
         }
-        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
+        if let Some(existing_initdb_timeline_id) = load_existing_initdb {
+            let Some(storage) = &self.remote_storage else {
+                bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
+            };
+            let (initdb_tar_zst_path, initdb_tar_zst) =
+                self::remote_timeline_client::download_initdb_tar_zst(
+                    self.conf,
+                    storage,
+                    &self.tenant_shard_id,
+                    &existing_initdb_timeline_id,
+                )
+                .await
+                .context("download initdb tar")?;
+            let buf_read = Box::pin(BufReader::new(initdb_tar_zst));
+            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
+                .await
+                .context("extract initdb tar")?;
 
-        // Upload the created data dir to S3
-        if let Some(storage) = &self.remote_storage {
-            let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
-            let pgdata_zstd = Bytes::from(pgdata_zstd);
-            backoff::retry(
-                || async {
-                    self::remote_timeline_client::upload_initdb_dir(
-                        storage,
-                        &self.tenant_shard_id.tenant_id,
-                        &timeline_id,
-                        pgdata_zstd.clone(),
-                    )
+            if initdb_tar_zst_path.exists() {
+                tokio::fs::remove_file(&initdb_tar_zst_path)
                     .await
-                },
-                |_| false,
-                3,
-                u32::MAX,
-                "persist_initdb_tar_zst",
-                // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-                backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
-            )
-            .await?;
+                    .context("tempfile removal")?;
+            }
+        } else {
+            // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
+            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
+
+            // Upload the created data dir to S3
+            if let Some(storage) = &self.remote_storage {
+                let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
+                let pgdata_zstd = Bytes::from(pgdata_zstd);
+                backoff::retry(
+                    || async {
+                        self::remote_timeline_client::upload_initdb_dir(
+                            storage,
+                            &self.tenant_shard_id.tenant_id,
+                            &timeline_id,
+                            pgdata_zstd.clone(),
+                        )
+                        .await
+                    },
+                    |_| false,
+                    3,
+                    u32::MAX,
+                    "persist_initdb_tar_zst",
+                    // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+                    backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+                )
+                .await?;
+            }
         }
+        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
 
         // Import the contents of the data directory at the initial checkpoint
         // LSN, and any WAL after that.
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 183ee19a40..5b649a420c 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -188,6 +188,7 @@ use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
 
+pub(crate) use download::download_initdb_tar_zst;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
@@ -1077,7 +1078,17 @@ impl RemoteTimelineClient {
 
         let remaining_layers: Vec<RemotePath> = remaining
             .into_iter()
-            .filter(|p| p!= &latest_index)
+            .filter(|p| {
+                if p == &latest_index {
+                    return false;
+                }
+                if let Some(name) = p.object_name() {
+                    if name == INITDB_PATH {
+                        return false;
+                    }
+                }
+                true
+            })
             .inspect(|path| {
                 if let Some(name) = path.object_name() {
                     info!(%name, "deleting a file not referenced from index_part.json");
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 1e9dcfe76a..3b5fe4b207 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -8,11 +8,12 @@ use std::future::Future;
 use std::time::Duration;
 
 use anyhow::{anyhow, Context};
-use camino::Utf8Path;
+use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::shard::TenantShardId;
-use tokio::fs;
-use tokio::io::AsyncWriteExt;
+use tokio::fs::{self, File, OpenOptions};
+use tokio::io::{AsyncSeekExt, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
+use tracing::warn;
 use utils::{backoff, crashsafe};
 
 use crate::config::PageServerConf;
@@ -20,14 +21,15 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
+use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::TimelineId;
 
 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
-    parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
-    FAILED_REMOTE_OP_RETRIES,
+    parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
+    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };
 
 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -374,6 +376,69 @@ pub(super) async fn download_index_part(
     }
 }
 
+pub(crate) async fn download_initdb_tar_zst(
+    conf: &'static PageServerConf,
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    timeline_id: &TimelineId,
+) -> Result<(Utf8PathBuf, File), DownloadError> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
+    let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
+
+    let timeline_path = conf.timelines_path(tenant_shard_id);
+
+    if !timeline_path.exists() {
+        tokio::fs::create_dir_all(&timeline_path)
+            .await
+            .with_context(|| format!("timeline dir creation {timeline_path}"))
+            .map_err(DownloadError::Other)?;
+    }
+    let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
+
+    let file = download_retry(
+        || async {
+            let mut file = OpenOptions::new()
+                .create(true)
+                .truncate(true)
+                .read(true)
+                .write(true)
+                .open(&temp_path)
+                .await
+                .with_context(|| format!("tempfile creation {temp_path}"))
+                .map_err(DownloadError::Other)?;
+
+            let mut download = storage.download(&remote_path).await?;
+
+            tokio::io::copy(&mut download.download_stream, &mut file)
+                .await
+                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
+                .map_err(DownloadError::Other)?;
+
+            file.seek(std::io::SeekFrom::Start(0))
+                .await
+                .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
+                .map_err(DownloadError::Other)?;
+
+            Ok(file)
+        },
+        &format!("download {remote_path}"),
+    )
+    .await
+    .map_err(|e| {
+        if temp_path.exists() {
+            // Do a best-effort attempt at deleting the temporary file upon encountering an error.
+            // We don't have async here nor do we want to pile on any extra errors.
+            if let Err(e) = std::fs::remove_file(&temp_path) {
+                warn!("error deleting temporary file {temp_path}: {e}");
+            }
+        }
+        e
+    })?;
+
+    Ok((temp_path, file))
+}
+
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 4189200d5c..d6e01d01be 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -2115,7 +2115,7 @@ mod tests {
             .load()
             .await;
         let tline = tenant
-            .bootstrap_timeline(TIMELINE_ID, pg_version, &ctx)
+            .bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
             .await
             .unwrap();
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 0e00d4a7de..76aa40122f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -362,12 +362,16 @@ class PageserverHttpClient(requests.Session):
         new_timeline_id: TimelineId,
         ancestor_timeline_id: Optional[TimelineId] = None,
         ancestor_start_lsn: Optional[Lsn] = None,
+        existing_initdb_timeline_id: Optional[TimelineId] = None,
         **kwargs,
     ) -> Dict[Any, Any]:
         body: Dict[str, Any] = {
             "new_timeline_id": str(new_timeline_id),
             "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
             "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
+            "existing_initdb_timeline_id": str(existing_initdb_timeline_id)
+            if existing_initdb_timeline_id
+            else None,
         }
         if pg_version != PgVersion.NOT_SET:
             body["pg_version"] = int(pg_version)
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 007ff387f4..e7b78cfb9a 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,7 +1,7 @@
 import time
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
-from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef
+from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
 
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
@@ -235,10 +235,14 @@ if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnvBuilder
 
 
-def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
+def assert_prefix_empty(
+    neon_env_builder: "NeonEnvBuilder",
+    prefix: Optional[str] = None,
+    allowed_postfix: Optional[str] = None,
+):
     response = list_prefix(neon_env_builder, prefix)
     keys = response["KeyCount"]
-    objects = response.get("Contents", [])
+    objects: List[ObjectTypeDef] = response.get("Contents", [])
     common_prefixes = response.get("CommonPrefixes", [])
 
     remote_storage = neon_env_builder.pageserver_remote_storage
@@ -261,7 +265,18 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
                 f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
             )
 
-    assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+    filtered_count = 0  # objects under the prefix that are NOT exempted by allowed_postfix
+    if allowed_postfix is None:
+        filtered_count = len(objects)
+    else:
+        for obj in objects:  # inspect each listed object's own key, not the response dict
+            key: str = str(obj.get("Key", ""))
+            if not key.endswith(allowed_postfix):
+                filtered_count += 1
+
+    assert (
+        filtered_count == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
 
 
 def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index ccf28cae8c..9c2bb2db11 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -603,7 +603,12 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
     remote_timeline_path = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id)
 
-    assert not list(remote_timeline_path.iterdir())
+    filtered = [
+        path
+        for path in remote_timeline_path.iterdir()
+        if not (path.name.endswith("initdb.tar.zst"))
+    ]
+    assert len(filtered) == 0
 
     # timeline deletion should kill ongoing uploads, so, the metric will be gone
     assert get_queued_count(file_kind="index", op_kind="upload") is None
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 167c8355cc..9bafa60d18 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -197,6 +197,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
             # So by ignoring these instead of waiting for empty upload queue
             # we execute more distinct code paths.
             '.*stopping left-over name="remote upload".*',
+            ".*Failed to load index_part from remote storage, failed creation?.*",
         ]
     )
 
@@ -285,6 +286,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
                     str(tenant_id),
                 )
             ),
+            allowed_postfix="initdb.tar.zst",
         )
 
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 090d586721..a31f410f94 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -290,10 +290,13 @@ def test_pageserver_with_empty_tenants(
 
     env = neon_env_builder.init_start()
 
-    env.pageserver.allowed_errors.append(
-        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*marking .* as locally complete, while it doesnt exist in remote index.*",
+            ".*Failed to load index_part from remote storage, failed creation?.*",
+            ".*load failed.*list timelines directory.*",
+        ]
     )
-    env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*")
 
     client = env.pageserver.http_client()
 
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 2e1fcd38fe..51ad971cdb 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -230,6 +230,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
     env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
     # In one of the branches we poll for tenant to become active. Polls can generate this log message:
     env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")
+    env.pageserver.allowed_errors.append(
+        ".*Failed to load index_part from remote storage, failed creation?.*"
+    )
 
     ps_http.configure_failpoints((failpoint, "return"))
 
@@ -308,8 +311,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
         )
 
     timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
-    # Check local is empty
-    assert not timeline_dir.exists()
+    if failpoint != "timeline-delete-after-index-delete":
+        # Check local is empty
+        assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0
     # Check no delete mark present
     assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()
 
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index b039b36255..4a9ffeee4b 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -1,6 +1,7 @@
 import sys
 import tarfile
 import tempfile
+import time
 from pathlib import Path
 
 import pytest
@@ -125,3 +126,43 @@ def test_wal_restore_initdb(
         )
         log.info(f"original lsn: {original_lsn}, restored lsn: {restored_lsn}")
         assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
+
+
+def test_wal_restore_http(
+    neon_env_builder: NeonEnvBuilder,
+    test_output_dir: Path,
+):
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("create table t as select generate_series(1,300000)")
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    ps_client = env.pageserver.http_client()
+
+    # shut down the endpoint and delete the timeline from the pageserver
+    endpoint.stop()
+
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    test_output_dir / "initdb.tar.zst"
+
+    (env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / "initdb.tar.zst")
+
+    ps_client.timeline_delete(tenant_id, timeline_id)
+    time.sleep(2)
+
+    # verify that it is indeed deleted
+    # TODO
+
+    # issue the restoration command
+    ps_client.timeline_create(
+        tenant_id=tenant_id,
+        new_timeline_id=timeline_id,
+        existing_initdb_timeline_id=timeline_id,
+        pg_version=env.pg_version,
+    )
+
+    # the table is back now!
+    restored = env.endpoints.create_start("main")
+    assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]

From f784e59b12a045e8a38613fe384315336e8f75ce Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Fri, 1 Dec 2023 00:12:52 +0100
Subject: [PATCH 006/202] Update timescaledb to 2.13.0 (#5975)

TimescaleDB has released 2.13.0. This version is compatible with
Postgres16
---
 Dockerfile.compute-node | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 8109091e76..425f163e8b 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -387,18 +387,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"
 
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export TIMESCALEDB_VERSION=2.10.1 \
-        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
-        ;; \
-      *) \
-        echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
-    esac && \
-    apt-get update && \
+RUN apt-get update && \
     apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
-    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \
+    echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \
     mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
     ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
     cd build && \

From 1ce1c82d78676257d007bbae667109cc2ab1a634 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 1 Dec 2023 11:58:06 +0100
Subject: [PATCH 007/202] Clean up local state if index_part.json request gives
 404 (#6009)

If `index_part.json` is (verifiably) not present on remote storage, we
should regard the timeline as nonexistent. This lets `clean_up_timelines`
purge the partial local disk state, which is important in the case of
incomplete creations leaving behind state that hinders retries. For
incomplete deletions, we also want the timeline's local disk content to be
gone completely.

The PR removes the allowed warnings added by #5390 and #5912, as we now
are only supposed to issue info level messages. It also adds a
reproducer for #6007, by parametrizing the
`test_timeline_init_break_before_checkpoint_recreate` test added by
#5390. If one reverts the .rs changes, the "cannot create its uninit
mark file" log line occurs once one comments out the failing checks for
the local disk state being actually empty.

Closes #6007

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/src/tenant.rs                    | 25 +++++++++++-----
 test_runner/regress/test_broken_timeline.py | 33 +++++++++++++--------
 test_runner/regress/test_tenant_delete.py   |  1 -
 test_runner/regress/test_tenants.py         |  1 -
 test_runner/regress/test_timeline_delete.py | 10 +++----
 5 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index bf063664b4..12610b1c89 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -796,20 +796,31 @@ impl Tenant {
         let mut timeline_ancestors = HashMap::new();
         let mut existent_timelines = HashSet::new();
         for (timeline_id, preload) in preload.timelines {
-            // In this context a timeline "exists" if it has any content in remote storage: this will
-            // be our cue to not delete any corresponding local directory
-            existent_timelines.insert(timeline_id);
-
             let index_part = match preload.index_part {
                 Ok(i) => {
                     debug!("remote index part exists for timeline {timeline_id}");
+                    // We found index_part on the remote, this is the standard case.
+                    existent_timelines.insert(timeline_id);
                     i
                 }
+                Err(DownloadError::NotFound) => {
+                    // There is no index_part on the remote. We only get here
+                    // if there is some prefix for the timeline in the remote storage.
+                    // This can e.g. be the initdb.tar.zst archive, maybe a
+                    // remnant from a prior incomplete creation or deletion attempt.
+                    // Delete the local directory as the deciding criterion for a
+                    // timeline's existence is presence of index_part.
+                    info!(%timeline_id, "index_part not found on remote");
+                    continue;
+                }
                 Err(e) => {
-                    // Timeline creation is not atomic: we might upload a layer but no index_part.  We expect
-                    // that the creation will be retried by the control plane and eventually result in
-                    // a valid loadable state.
+                    // Some (possibly ephemeral) error happened during index_part download.
+                    // Pretend the timeline exists to not delete the timeline directory,
+                    // as it might be a temporary issue and we don't want to re-download
+                    // everything after it resolves.
                     warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
+
+                    existent_timelines.insert(timeline_id);
                     continue;
                 }
             };
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 4911fc09d6..84a322039a 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -114,7 +114,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
         [
             ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
             ".*Timeline got dropped without initializing, cleaning its files.*",
-            ".*Failed to load index_part from remote storage, failed creation?.*",
         ]
     )
 
@@ -144,8 +143,13 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
     ), "pageserver should clean its temp timeline files on timeline creation failure"
 
 
-def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
+# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups
+@pytest.mark.parametrize("exit_or_return", ["return", "exit"])
+def test_timeline_init_break_before_checkpoint_recreate(
+    neon_env_builder: NeonEnvBuilder, exit_or_return: str
+):
+    env = neon_env_builder.init_configs()
+    env.start()
     pageserver_http = env.pageserver.http_client()
 
     env.pageserver.allowed_errors.extend(
@@ -156,6 +160,7 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn
         ]
     )
 
+    pageserver_http.tenant_create(env.initial_tenant)
     tenant_id = env.initial_tenant
 
     timelines_dir = env.pageserver.timeline_dir(tenant_id)
@@ -166,13 +171,17 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn
     timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b")
 
     # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
-    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
-    with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
-        _ = env.neon_cli.create_timeline(
-            "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
-        )
+    failpoint = "before-checkpoint-new-timeline"
+    pattern = failpoint
+    if exit_or_return == "exit":
+        # in reality a read error happens, but there are automatic retries which now fail because pageserver is dead
+        pattern = "Connection aborted."
 
-    # Restart the page server
+    pageserver_http.configure_failpoints((failpoint, exit_or_return))
+    with pytest.raises(Exception, match=pattern):
+        _ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)
+
+    # Restart the page server (with the failpoint disabled)
     env.pageserver.restart(immediate=True)
 
     # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
@@ -186,11 +195,9 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn
         timeline_dirs == initial_timeline_dirs
     ), "pageserver should clean its temp timeline files on timeline creation failure"
 
-    # Disable the failpoint again
-    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "off"))
     # creating the branch should have worked now
-    new_timeline_id = env.neon_cli.create_timeline(
-        "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
+    new_timeline_id = TimelineId(
+        pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"]
     )
 
     assert timeline_id == new_timeline_id
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 9bafa60d18..89c474286a 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -197,7 +197,6 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
             # So by ignoring these instead of waiting for empty upload queue
             # we execute more distinct code paths.
             '.*stopping left-over name="remote upload".*',
-            ".*Failed to load index_part from remote storage, failed creation?.*",
         ]
     )
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index a31f410f94..c81be41530 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -293,7 +293,6 @@ def test_pageserver_with_empty_tenants(
     env.pageserver.allowed_errors.extend(
         [
             ".*marking .* as locally complete, while it doesnt exist in remote index.*",
-            ".*Failed to load index_part from remote storage, failed creation?.*",
             ".*load failed.*list timelines directory.*",
         ]
     )
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 51ad971cdb..b1a2755394 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -230,9 +230,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
     env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
     # In one of the branches we poll for tenant to become active. Polls can generate this log message:
     env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")
-    env.pageserver.allowed_errors.append(
-        ".*Failed to load index_part from remote storage, failed creation?.*"
-    )
 
     ps_http.configure_failpoints((failpoint, "return"))
 
@@ -311,9 +308,10 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
         )
 
     timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
-    if failpoint != "timeline-delete-after-index-delete":
-        # Check local is empty
-        assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0
+
+    # Check local is empty
+    assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0
+
     # Check no delete mark present
     assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()
 

From 1c88824ed0e6bfbce02fa92e13ca91d5ab0e37b3 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 1 Dec 2023 12:52:59 +0100
Subject: [PATCH 008/202] initial logical size calculation: add a bunch of
 metrics (#5995)

These will help us answer questions such as:
- when & at what do calculations get started after PS restart?
- how often is the api to get current incrementally-computed logical
  size called, and does it return Exact vs Approximate?

I'd also be interested in a histogram of how much wall clock
time size calculations take, but, I don't know good bucket sizes,
and, logging it would introduce yet another per-timeline log
message during startup; don't think that's worth it just yet.

Context

- https://neondb.slack.com/archives/C033RQ5SPDH/p1701197668789769
- https://github.com/neondatabase/neon/issues/5962
- https://github.com/neondatabase/neon/issues/5963
- https://github.com/neondatabase/neon/pull/5955
- https://github.com/neondatabase/cloud/issues/7408
---
 pageserver/src/metrics.rs                     | 128 ++++++++++++++++++
 pageserver/src/tenant/timeline.rs             |  14 +-
 .../src/tenant/timeline/logical_size.rs       |  19 ++-
 3 files changed, 154 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c15b1fce63..6aee40b579 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -403,6 +403,134 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define current logical size metric")
 });
 
+pub(crate) mod initial_logical_size {
+    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
+    use once_cell::sync::Lazy;
+
+    use crate::task_mgr::TaskKind;
+
+    pub(crate) struct StartCalculation(IntCounterVec);
+    pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
+        StartCalculation(
+            register_int_counter_vec!(
+                "pageserver_initial_logical_size_start_calculation",
+                "Incremented each time we start an initial logical size calculation attempt. \
+                 The `task_kind` label is for the task kind that caused this attempt.",
+                &["attempt", "task_kind"]
+            )
+            .unwrap(),
+        )
+    });
+
+    struct DropCalculation {
+        first: IntCounter,
+        retry: IntCounter,
+    }
+
+    static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
+        let vec = register_int_counter_vec!(
+            "pageserver_initial_logical_size_drop_calculation",
+            "Incremented each time we abort a started size calculation attmpt.",
+            &["attempt"]
+        )
+        .unwrap();
+        DropCalculation {
+            first: vec.with_label_values(&["first"]),
+            retry: vec.with_label_values(&["retry"]),
+        }
+    });
+
+    pub(crate) struct Calculated {
+        pub(crate) births: IntCounter,
+        pub(crate) deaths: IntCounter,
+    }
+
+    pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
+        births: register_int_counter!(
+            "pageserver_initial_logical_size_finish_calculation",
+            "Incremented every time we finish calculation of initial logical size.\
+             If everything is working well, this should happen at most once per Timeline object."
+        )
+        .unwrap(),
+        deaths: register_int_counter!(
+            "pageserver_initial_logical_size_drop_finished_calculation",
+            "Incremented when we drop a finished initial logical size calculation result.\
+             Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
+        )
+        .unwrap(),
+    });
+
+    pub(crate) struct OngoingCalculationGuard {
+        inc_drop_calculation: Option<IntCounter>,
+    }
+
+    impl StartCalculation {
+        pub(crate) fn first(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
+            let task_kind_label: &'static str =
+                causing_task_kind.map(|k| k.into()).unwrap_or_default();
+            self.0.with_label_values(&["first", task_kind_label]);
+            OngoingCalculationGuard {
+                inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
+            }
+        }
+        pub(crate) fn retry(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
+            let task_kind_label: &'static str =
+                causing_task_kind.map(|k| k.into()).unwrap_or_default();
+            self.0.with_label_values(&["retry", task_kind_label]);
+            OngoingCalculationGuard {
+                inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
+            }
+        }
+    }
+
+    impl Drop for OngoingCalculationGuard {
+        fn drop(&mut self) {
+            if let Some(counter) = self.inc_drop_calculation.take() {
+                counter.inc();
+            }
+        }
+    }
+
+    impl OngoingCalculationGuard {
+        pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
+            drop(self.inc_drop_calculation.take());
+            CALCULATED.births.inc();
+            FinishedCalculationGuard {
+                inc_on_drop: CALCULATED.deaths.clone(),
+            }
+        }
+    }
+
+    pub(crate) struct FinishedCalculationGuard {
+        inc_on_drop: IntCounter,
+    }
+
+    impl Drop for FinishedCalculationGuard {
+        fn drop(&mut self) {
+            self.inc_on_drop.inc();
+        }
+    }
+
+    pub(crate) struct Calls {
+        pub(crate) approximate: IntCounter,
+        pub(crate) exact: IntCounter,
+    }
+
+    pub(crate) static CALLS: Lazy<Calls> = Lazy::new(|| {
+        let vec = register_int_counter_vec!(
+            "pageserver_initial_logical_size_calls",
+            "Incremented each time some code asks for incremental logical size.\
+             The label records the accuracy of the result.",
+            &["accuracy"]
+        )
+        .unwrap();
+        Calls {
+            approximate: vec.with_label_values(&["approximate"]),
+            exact: vec.with_label_values(&["exact"]),
+        }
+    });
+}
+
 pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_tenant_states_count",
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 9a7c9a6df3..341ec3e255 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1812,6 +1812,7 @@ impl Timeline {
             "spawning logical size computation from context of task kind {:?}",
             ctx.task_kind()
         );
+        let causing_task_kind = ctx.task_kind();
         // We need to start the computation task.
         // It gets a separate context since it will outlive the request that called this function.
         let self_clone = Arc::clone(self);
@@ -1839,6 +1840,8 @@ impl Timeline {
                     _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
                 };
 
+
+
                 // hold off background tasks from starting until all timelines get to try at least
                 // once initial logical size calculation; though retry will rarely be useful.
                 // holding off is done because heavier tasks execute blockingly on the same
@@ -1846,7 +1849,12 @@ impl Timeline {
                 //
                 // dropping this at every outcome is probably better than trying to cling on to it,
                 // delay will be terminated by a timeout regardless.
-                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
+                let completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
+
+                let metrics_guard = match &completion {
+                    Some(_) => crate::metrics::initial_logical_size::START_CALCULATION.first(Some(causing_task_kind)),
+                    None => crate::metrics::initial_logical_size::START_CALCULATION.retry(Some(causing_task_kind)),
+                };
 
                 let calculated_size = match self_clone
                     .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
@@ -1891,11 +1899,11 @@ impl Timeline {
                 match self_clone
                     .current_logical_size
                     .initial_logical_size
-                    .set(calculated_size)
+                    .set((calculated_size, metrics_guard.calculation_result_saved()))
                 {
                     Ok(()) => (),
                     Err(_what_we_just_attempted_to_set) => {
-                        let existing_size = self_clone
+                        let (existing_size, _) = self_clone
                             .current_logical_size
                             .initial_logical_size
                             .get()
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index d9c2bc4cb9..43d8faeb0b 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -23,7 +23,10 @@ pub(super) struct LogicalSize {
     ///
     /// NOTE: size at a given LSN is constant, but after a restart we will calculate
     /// the initial size at a different LSN.
-    pub initial_logical_size: OnceCell<u64>,
+    pub initial_logical_size: OnceCell<(
+        u64,
+        crate::metrics::initial_logical_size::FinishedCalculationGuard,
+    )>,
 
     /// Semaphore to track ongoing calculation of `initial_logical_size`.
     pub initial_size_computation: Arc<tokio::sync::Semaphore>,
@@ -78,7 +81,11 @@ impl CurrentLogicalSize {
 impl LogicalSize {
     pub(super) fn empty_initial() -> Self {
         Self {
-            initial_logical_size: OnceCell::with_value(0),
+            initial_logical_size: OnceCell::with_value((0, {
+                crate::metrics::initial_logical_size::START_CALCULATION
+                    .first(None)
+                    .calculation_result_saved()
+            })),
             //  initial_logical_size already computed, so, don't admit any calculations
             initial_size_computation: Arc::new(Semaphore::new(0)),
             initial_part_end: None,
@@ -100,12 +107,16 @@ impl LogicalSize {
         //                  ^^^ keep this type explicit so that the casts in this function break if
         //                  we change the type.
         match self.initial_logical_size.get() {
-            Some(initial_size) => {
+            Some((initial_size, _)) => {
+                crate::metrics::initial_logical_size::CALLS.exact.inc();
                 initial_size.checked_add_signed(size_increment)
                     .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
                     .map(CurrentLogicalSize::Exact)
             }
             None => {
+                crate::metrics::initial_logical_size::CALLS
+                    .approximate
+                    .inc();
                 let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
                 Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
             }
@@ -121,7 +132,7 @@ impl LogicalSize {
     /// available for re-use. This doesn't contain the incremental part.
     pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
         match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
+            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
             _ => None,
         }
     }

From 8cd28e17181124a1a0af122f3550e5a8e7877a8d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <me@cschwarz.com>
Date: Wed, 29 Nov 2023 15:34:46 +0000
Subject: [PATCH 009/202] logical size calculation: make .current_size()
 infallible (#5999)

... by panicking on overflow;

It was made fallible initially due to a lack of confidence in the logical size
calculation. However, the error has never happened since I have been at Neon.

Let's stop worrying about this by converting the overflow check into a panic.
---
 pageserver/src/tenant/timeline.rs              | 8 +++-----
 pageserver/src/tenant/timeline/logical_size.rs | 8 ++++----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 341ec3e255..2eaf015dd4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -859,7 +859,7 @@ impl Timeline {
         self: &Arc<Self>,
         ctx: &RequestContext,
     ) -> anyhow::Result<(u64, bool)> {
-        let current_size = self.current_logical_size.current_size()?;
+        let current_size = self.current_logical_size.current_size();
         debug!("Current size: {current_size:?}");
 
         let mut is_exact = true;
@@ -2057,16 +2057,14 @@ impl Timeline {
         // one value while current_logical_size is set to the
         // other.
         match logical_size.current_size() {
-            Ok(CurrentLogicalSize::Exact(new_current_size)) => self
+            CurrentLogicalSize::Exact(new_current_size) => self
                 .metrics
                 .current_logical_size_gauge
                 .set(new_current_size),
-            Ok(CurrentLogicalSize::Approximate(_)) => {
+            CurrentLogicalSize::Approximate(_) => {
                 // don't update the gauge yet, this allows us not to update the gauge back and
                 // forth between the initial size calculation task.
             }
-            // this is overflow
-            Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
         }
     }
 
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index 43d8faeb0b..ec93955254 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -102,23 +102,23 @@ impl LogicalSize {
         }
     }
 
-    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
+    pub(super) fn current_size(&self) -> CurrentLogicalSize {
         let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
         //                  ^^^ keep this type explicit so that the casts in this function break if
         //                  we change the type.
         match self.initial_logical_size.get() {
             Some((initial_size, _)) => {
                 crate::metrics::initial_logical_size::CALLS.exact.inc();
-                initial_size.checked_add_signed(size_increment)
+                CurrentLogicalSize::Exact(initial_size.checked_add_signed(size_increment)
                     .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .map(CurrentLogicalSize::Exact)
+                    .unwrap())
             }
             None => {
                 crate::metrics::initial_logical_size::CALLS
                     .approximate
                     .inc();
                 let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
+                CurrentLogicalSize::Approximate(non_negative_size_increment)
             }
         }
     }

From ce1652990d89c0d68aa2b788ebbd12c90e36dbc5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <me@cschwarz.com>
Date: Wed, 29 Nov 2023 16:55:13 +0000
Subject: [PATCH 010/202] logical size: better represent level of accuracy in
 the type system (#5999)

I would love to not expose the inaccurate value in the mgmt API at all,
and in fact control plane doesn't use it [^1].
But our tests do, and I have no desire to change them at this time.

[^1]: https://github.com/neondatabase/cloud/pull/8317
---
 libs/pageserver_api/src/models.rs             |  4 +-
 pageserver/src/consumption_metrics/metrics.rs | 13 ++---
 pageserver/src/http/routes.rs                 | 14 +++---
 pageserver/src/tenant/timeline.rs             | 15 +++---
 .../src/tenant/timeline/logical_size.rs       | 49 +++++++++++++++----
 .../walreceiver/walreceiver_connection.rs     |  7 +--
 6 files changed, 63 insertions(+), 39 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index e6236f91e3..2234a06501 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -384,7 +384,9 @@ pub struct TimelineInfo {
     /// The LSN that we are advertizing to safekeepers
     pub remote_consistent_lsn_visible: Lsn,
 
-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    pub current_logical_size: u64,
+    pub current_logical_size_is_accurate: bool,
+
     /// Sum of the size of all layer files.
     /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs
index 213f08484c..c6ff91e560 100644
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -1,5 +1,4 @@
-use crate::context::RequestContext;
-use anyhow::Context;
+use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
@@ -352,13 +351,11 @@ impl TimelineSnapshot {
 
             let current_exact_logical_size = {
                 let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
-                let res = span
-                    .in_scope(|| t.get_current_logical_size(ctx))
-                    .context("get_current_logical_size");
-                match res? {
+                let size = span.in_scope(|| t.get_current_logical_size(ctx));
+                match size {
                     // Only send timeline logical size when it is fully calculated.
-                    (size, is_exact) if is_exact => Some(size),
-                    (_, _) => None,
+                    CurrentLogicalSize::Exact(ref size) => Some(size.into()),
+                    CurrentLogicalSize::Approximate(_) => None,
                 }
             };
 
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7ab56ccab2..29a1ff52e8 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -338,13 +338,7 @@ async fn build_timeline_info_common(
         Lsn(0) => None,
         lsn @ Lsn(_) => Some(lsn),
     };
-    let current_logical_size = match timeline.get_current_logical_size(ctx) {
-        Ok((size, _)) => Some(size),
-        Err(err) => {
-            error!("Timeline info creation failed to get current logical size: {err:?}");
-            None
-        }
-    };
+    let current_logical_size = timeline.get_current_logical_size(ctx);
     let current_physical_size = Some(timeline.layer_size_sum().await);
     let state = timeline.current_state();
     let remote_consistent_lsn_projected = timeline
@@ -368,7 +362,11 @@ async fn build_timeline_info_common(
         last_record_lsn,
         prev_record_lsn: Some(timeline.get_prev_record_lsn()),
         latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
-        current_logical_size,
+        current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
+        current_logical_size_is_accurate: match current_logical_size.accuracy() {
+            tenant::timeline::logical_size::Accuracy::Approximate => false,
+            tenant::timeline::logical_size::Accuracy::Exact => true,
+        },
         current_physical_size,
         current_logical_size_non_incremental: None,
         timeline_dir_layer_file_size_sum: None,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2eaf015dd4..6313e7be84 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2,7 +2,7 @@ pub mod delete;
 mod eviction_task;
 mod init;
 pub mod layer_manager;
-mod logical_size;
+pub(crate) mod logical_size;
 pub mod span;
 pub mod uninit;
 mod walreceiver;
@@ -855,23 +855,20 @@ impl Timeline {
     /// the initial size calculation has not been run (gets triggered on the first size access).
     ///
     /// return size and boolean flag that shows if the size is exact
-    pub fn get_current_logical_size(
+    pub(crate) fn get_current_logical_size(
         self: &Arc<Self>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<(u64, bool)> {
+    ) -> logical_size::CurrentLogicalSize {
         let current_size = self.current_logical_size.current_size();
         debug!("Current size: {current_size:?}");
 
-        let mut is_exact = true;
-        let size = current_size.size();
         if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
-            is_exact = false;
             self.try_spawn_size_init_task(initial_part_end, ctx);
         }
 
-        Ok((size, is_exact))
+        current_size
     }
 
     /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
@@ -2057,10 +2054,10 @@ impl Timeline {
         // one value while current_logical_size is set to the
         // other.
         match logical_size.current_size() {
-            CurrentLogicalSize::Exact(new_current_size) => self
+            CurrentLogicalSize::Exact(ref new_current_size) => self
                 .metrics
                 .current_logical_size_gauge
-                .set(new_current_size),
+                .set(new_current_size.into()),
             CurrentLogicalSize::Approximate(_) => {
                 // don't update the gauge yet, this allows us not to update the gauge back and
                 // forth between the initial size calculation task.
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index ec93955254..1f103051ef 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -59,21 +59,50 @@ pub(super) struct LogicalSize {
 
 /// Normalized current size, that the data in pageserver occupies.
 #[derive(Debug, Clone, Copy)]
-pub(super) enum CurrentLogicalSize {
+pub(crate) enum CurrentLogicalSize {
     /// The size is not yet calculated to the end, this is an intermediate result,
     /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
     /// yet total logical size cannot be below 0.
-    Approximate(u64),
+    Approximate(Approximate),
     // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
     // available for observation without any calculations.
-    Exact(u64),
+    Exact(Exact),
+}
+
+#[derive(Debug, Copy, Clone)]
+pub(crate) enum Accuracy {
+    Approximate,
+    Exact,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Approximate(u64);
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Exact(u64);
+
+impl From<&Approximate> for u64 {
+    fn from(value: &Approximate) -> Self {
+        value.0
+    }
+}
+
+impl From<&Exact> for u64 {
+    fn from(val: &Exact) -> Self {
+        val.0
+    }
 }
 
 impl CurrentLogicalSize {
-    pub(super) fn size(&self) -> u64 {
-        *match self {
-            Self::Approximate(size) => size,
-            Self::Exact(size) => size,
+    pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
+        match self {
+            Self::Approximate(size) => size.into(),
+            Self::Exact(size) => size.into(),
+        }
+    }
+    pub(crate) fn accuracy(&self) -> Accuracy {
+        match self {
+            Self::Approximate(_) => Accuracy::Approximate,
+            Self::Exact(_) => Accuracy::Exact,
         }
     }
 }
@@ -109,16 +138,16 @@ impl LogicalSize {
         match self.initial_logical_size.get() {
             Some((initial_size, _)) => {
                 crate::metrics::initial_logical_size::CALLS.exact.inc();
-                CurrentLogicalSize::Exact(initial_size.checked_add_signed(size_increment)
+                CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
                     .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .unwrap())
+                    .unwrap()))
             }
             None => {
                 crate::metrics::initial_logical_size::CALLS
                     .approximate
                     .inc();
                 let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                CurrentLogicalSize::Approximate(non_negative_size_increment)
+                CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
             }
         }
     }
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 2b4aea7596..7045658f24 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -396,11 +396,12 @@ pub(super) async fn handle_walreceiver_connection(
 
             // Send the replication feedback message.
             // Regular standby_status_update fields are put into this message.
-            let (timeline_logical_size, _) = timeline
+            let current_timeline_size = timeline
                 .get_current_logical_size(&ctx)
-                .context("Status update creation failed to get current logical size")?;
+                // FIXME: https://github.com/neondatabase/neon/issues/5963
+                .size_dont_care_about_accuracy();
             let status_update = PageserverFeedback {
-                current_timeline_size: timeline_logical_size,
+                current_timeline_size,
                 last_received_lsn,
                 disk_consistent_lsn,
                 remote_consistent_lsn,

From 7eabfc40eeb3303ca2a0ce041964cce2cbd861b0 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 1 Dec 2023 13:26:19 +0000
Subject: [PATCH 011/202] test_runner: use separate directory for each rerun
 (#6004)

## Problem

While investigating https://github.com/neondatabase/neon/issues/5854, we
hypothesised that logs/repo-dir from the initial failure might leak into
reruns. Use different directories for each run to avoid such a
possibility.

## Summary of changes
- make each test rerun use different directories
- update `pytest-rerunfailures` plugin from 11.1.2 to 13.0
---
 poetry.lock                           | 20 +++++---------------
 pyproject.toml                        |  2 +-
 test_runner/fixtures/neon_fixtures.py |  5 +++++
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 830f80dc97..a85325b696 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1967,18 +1967,18 @@ pytest = [
 
 [[package]]
 name = "pytest-rerunfailures"
-version = "11.1.2"
+version = "13.0"
 description = "pytest plugin to re-run tests to eliminate flaky failures"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
-    {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
+    {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"},
+    {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"},
 ]
 
 [package.dependencies]
 packaging = ">=17.1"
-pytest = ">=5.3"
+pytest = ">=7"
 
 [[package]]
 name = "pytest-split"
@@ -2476,16 +2476,6 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2707,4 +2697,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "25ffa9ed98d890a3b85e6036792296a60bb705e8f9eaa1f07336501116a58756"
+content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
diff --git a/pyproject.toml b/pyproject.toml
index 60cf0e5c96..536efeab56 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
 aiohttp = "3.9.0"
-pytest-rerunfailures = "^11.1.2"
+pytest-rerunfailures = "^13.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 188851c39f..9545dc2dd5 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3029,6 +3029,11 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     """Compute the working directory for an individual test."""
     test_name = request.node.name
     test_dir = top_output_dir / test_name.replace("/", "-")
+
+    # We rerun flaky tests multiple times, so use a separate directory for each run.
+    if (suffix := getattr(request.node, "execution_count", None)) is not None:
+        test_dir = test_dir.parent / f"{test_dir.name}-{suffix}"
+
     log.info(f"get_test_output_dir is {test_dir}")
     # make mypy happy
     assert isinstance(test_dir, Path)

From 92e7cd40e88825a489c8d3c19c78ef001b59d06c Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Fri, 1 Dec 2023 13:40:49 +0000
Subject: [PATCH 012/202] add sql_exporter to vm-image (#5949)

expose LFC metrics
---
 vm-image-spec.yaml | 85 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index d0ba742a63..6f0ebe5f66 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -13,6 +13,10 @@ commands:
     user: nobody
     sysvInitAction: respawn
     shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter'
+  - name: sql-exporter
+    user: nobody
+    sysvInitAction: respawn
+    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml'
 shutdownHook: |
   su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
 files:
@@ -46,6 +50,77 @@ files:
           }
           memory {}
       }
+  - filename: sql_exporter.yml
+    content: |
+      # Configuration for sql_exporter
+      # Global defaults.
+      global:
+        # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
+        scrape_timeout: 10s
+        # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
+        scrape_timeout_offset: 500ms
+        # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
+        min_interval: 0s
+        # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
+        # as will concurrent scrapes.
+        max_connections: 1
+        # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
+        # always be the same as max_connections.
+        max_idle_connections: 1
+        # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
+        # If 0, connections are not closed due to a connection's age.
+        max_connection_lifetime: 5m
+
+      # The target to monitor and the collectors to execute on it.
+      target:
+        # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
+        # the schema gets dropped or replaced to match the driver expected DSN format.
+        data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable'
+
+        # Collectors (referenced by name) to execute on the target.
+        # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
+        collectors: [neon_collector]
+
+      # Collector files specifies a list of globs. One collector definition is read from each matching file.
+      # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
+      collector_files:
+        - "neon_collector.yml"
+  - filename: neon_collector.yml
+    content: |
+      collector_name: neon_collector
+      metrics:
+      - metric_name: lfc_misses
+        type: gauge
+        help: 'lfc_misses'
+        key_labels:
+        values: [lfc_misses]
+        query: |
+          select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
+
+      - metric_name: lfc_used
+        type: gauge
+        help: 'lfc_used'
+        key_labels:
+        values: [lfc_used]
+        query: |
+          select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
+
+      - metric_name: lfc_hits
+        type: gauge
+        help: 'lfc_hits'
+        key_labels:
+        values: [lfc_hits]
+        query: |
+          select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
+
+      - metric_name: lfc_writes
+        type: gauge
+        help: 'lfc_writes'
+        key_labels:
+        values: [lfc_writes]
+        query: |
+          select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
+
 build: |
   # Build cgroup-tools
   #
@@ -82,6 +157,8 @@ build: |
 
   FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter
 
+  FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
+
   # Build pgbouncer
   #
   FROM debian:bullseye-slim AS pgbouncer
@@ -116,13 +193,19 @@ merge: |
 
   COPY cgconfig.conf /etc/cgconfig.conf
   COPY pgbouncer.ini /etc/pgbouncer.ini
+  COPY sql_exporter.yml /etc/sql_exporter.yml
+  COPY neon_collector.yml /etc/neon_collector.yml
+
   RUN set -e \
       && chown postgres:postgres /etc/pgbouncer.ini \
       && chmod 0644 /etc/pgbouncer.ini \
-      && chmod 0644 /etc/cgconfig.conf
+      && chmod 0644 /etc/cgconfig.conf \
+      && chmod 0644 /etc/sql_exporter.yml \
+      && chmod 0644 /etc/neon_collector.yml
 
   COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
   COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
   COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
   COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
+  COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter
   COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer

From e49c21a3cd21469ab2a6362326cbb85f4c27e7fa Mon Sep 17 00:00:00 2001
From: bojanserafimov <bojan.serafimov7@gmail.com>
Date: Fri, 1 Dec 2023 09:11:41 -0500
Subject: [PATCH 013/202] Speed up rel extend (#5983)

---
 pageserver/src/walingest.rs | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index d6e01d01be..cbb08f7ff1 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1437,7 +1437,16 @@ impl<'a> WalIngest<'a> {
         // record.
         // TODO: would be nice if to be more explicit about it
         let last_lsn = modification.lsn;
-        let old_nblocks = if !self
+
+        // Get current size and put rel creation if rel doesn't exist
+        //
+        // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
+        //       check the cache too. This is because eagerly checking the cache results in
+        //       less work overall and 10% better performance. It's more work on cache miss
+        //       but cache miss is rare.
+        let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
+            nblocks
+        } else if !self
             .timeline
             .get_rel_exists(rel, last_lsn, true, ctx)
             .await?

From fd81945a60dff25fea65a50f2c8cd6e4c955fba4 Mon Sep 17 00:00:00 2001
From: bojanserafimov <bojan.serafimov7@gmail.com>
Date: Fri, 1 Dec 2023 09:16:24 -0500
Subject: [PATCH 014/202] Use TEST_OUTPUT envvar in pageserver (#5984)

---
 pageserver/src/config.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 737495d414..13d1fc775b 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -855,7 +855,8 @@ impl PageServerConf {
 
     #[cfg(test)]
     pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
-        Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
+        let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
+        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
     }
 
     pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {

From 711425cc4757a325c5e496f4e9843b351605fccc Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 1 Dec 2023 18:30:51 +0200
Subject: [PATCH 015/202] fix: use create_new instead of create for mutex file
 (#6012)

Using create_new makes the uninit marker work as a mutual exclusion
primitive. Hopefully this is only temporary.
---
 pageserver/src/tenant.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 12610b1c89..422cb671fe 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3239,7 +3239,10 @@ impl Tenant {
         let uninit_mark_path = self
             .conf
             .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
-        fs::File::create(&uninit_mark_path)
+        fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .open(&uninit_mark_path)
             .context("Failed to create uninit mark file")
             .and_then(|_| {
                 crashsafe::fsync_file_and_parent(&uninit_mark_path)

From c1295bfb3ab27f85dd1a9f5f30ac85746df10ab6 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Fri, 1 Dec 2023 18:19:55 +0100
Subject: [PATCH 016/202] [compute_ctl] Use correct HTTP code in the
 /configure errors (#6017)

It was using `PRECONDITION_FAILED` for errors during `ComputeSpec` to
`ParsedSpec` conversion, but this disobeys the OpenAPI spec [1], and the
correct code should be `BAD_REQUEST` for any spec processing errors.

While at it, I also noticed that the `compute_ctl` OpenAPI spec has an
invalid format and fixed it.

[1] https://github.com/neondatabase/neon/blob/fd81945a60dff25fea65a50f2c8cd6e4c955fba4/compute_tools/src/http/openapi_spec.yaml#L119-L120
---
 compute_tools/src/http/api.rs            |  2 +-
 compute_tools/src/http/openapi_spec.yaml | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index ef6ca6eee3..fa2c4cff28 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -227,7 +227,7 @@ async fn handle_configure_request(
 
         let parsed_spec = match ParsedSpec::try_from(spec) {
             Ok(ps) => ps,
-            Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
+            Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
         };
 
         // XXX: wrap state update under lock in code blocks. Otherwise,
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index dc26cc63eb..cedc6ece8f 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -156,17 +156,17 @@ paths:
                 description: Error text or 'OK' if download succeeded.
                 example: "OK"
         400:
-        description: Request is invalid.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
         500:
-        description: Extension download request failed.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
 
 components:
   securitySchemes:

From e43cde7aba2681d8d2e703fdfdb963caccf5acd0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 1 Dec 2023 22:45:04 +0100
Subject: [PATCH 017/202] initial logical size: remove CALLS metric from hot
 path (#6018)

The `CALLS` metric was only introduced a few hours ago (#5995), but after
looking at the numbers from staging I realized that
`get_current_logical_size()` is on the walingest hot path: we call it for
every `ReplicationMessage::XLogData` that we receive.

Since the metric is global, it would be quite a busy cache line.

This PR replaces it with a new metric purpose-built for what's most
interesting right now.
---
 pageserver/src/metrics.rs                     | 28 +++++++------------
 pageserver/src/tenant/timeline.rs             | 18 ++++++++++++
 .../src/tenant/timeline/logical_size.rs       | 12 ++++----
 3 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 6aee40b579..d2684691e0 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -511,24 +511,16 @@ pub(crate) mod initial_logical_size {
         }
     }
 
-    pub(crate) struct Calls {
-        pub(crate) approximate: IntCounter,
-        pub(crate) exact: IntCounter,
-    }
-
-    pub(crate) static CALLS: Lazy<Calls> = Lazy::new(|| {
-        let vec = register_int_counter_vec!(
-            "pageserver_initial_logical_size_calls",
-            "Incremented each time some code asks for incremental logical size.\
-             The label records the accuracy of the result.",
-            &["accuracy"]
-        )
-        .unwrap();
-        Calls {
-            approximate: vec.with_label_values(&["approximate"]),
-            exact: vec.with_label_values(&["exact"]),
-        }
-    });
+    // context: https://github.com/neondatabase/neon/issues/5963
+    pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
+        Lazy::new(|| {
+            register_int_counter!(
+                "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
+                "Counter for the following event: walreceiver calls\
+                 Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
+            )
+            .unwrap()
+        });
 }
 
 pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6313e7be84..bf4e19e5fb 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -868,6 +868,24 @@ impl Timeline {
             self.try_spawn_size_init_task(initial_part_end, ctx);
         }
 
+        if let CurrentLogicalSize::Approximate(_) = &current_size {
+            if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
+                let first = self
+                    .current_logical_size
+                    .did_return_approximate_to_walreceiver
+                    .compare_exchange(
+                        false,
+                        true,
+                        AtomicOrdering::Relaxed,
+                        AtomicOrdering::Relaxed,
+                    )
+                    .is_ok();
+                if first {
+                    crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
+                }
+            }
+        }
+
         current_size
     }
 
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index 1f103051ef..a33fb28ebd 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -4,7 +4,7 @@ use once_cell::sync::OnceCell;
 use tokio::sync::Semaphore;
 use utils::lsn::Lsn;
 
-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
+use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
 use std::sync::Arc;
 
 /// Internal structure to hold all data needed for logical size calculation.
@@ -55,6 +55,9 @@ pub(super) struct LogicalSize {
     /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
     /// to modify this, it will also keep the prometheus metric in sync.
     pub size_added_after_initial: AtomicI64,
+
+    /// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
+    pub(super) did_return_approximate_to_walreceiver: AtomicBool,
 }
 
 /// Normalized current size, that the data in pageserver occupies.
@@ -119,6 +122,7 @@ impl LogicalSize {
             initial_size_computation: Arc::new(Semaphore::new(0)),
             initial_part_end: None,
             size_added_after_initial: AtomicI64::new(0),
+            did_return_approximate_to_walreceiver: AtomicBool::new(false),
         }
     }
 
@@ -128,6 +132,7 @@ impl LogicalSize {
             initial_size_computation: Arc::new(Semaphore::new(1)),
             initial_part_end: Some(compute_to),
             size_added_after_initial: AtomicI64::new(0),
+            did_return_approximate_to_walreceiver: AtomicBool::new(false),
         }
     }
 
@@ -137,15 +142,12 @@ impl LogicalSize {
         //                  we change the type.
         match self.initial_logical_size.get() {
             Some((initial_size, _)) => {
-                crate::metrics::initial_logical_size::CALLS.exact.inc();
                 CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
                     .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
                     .unwrap()))
             }
             None => {
-                crate::metrics::initial_logical_size::CALLS
-                    .approximate
-                    .inc();
+
                 let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
                 CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
             }

From e3512340c1855977927fa43352a5afcaa5b94fe9 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Sun, 3 Dec 2023 15:21:44 +0000
Subject: [PATCH 018/202] Override neon.max_cluster_size for the time of
 compute_ctl (#5998)

Temporarily reset neon.max_cluster_size to avoid
the possibility of hitting the limit, while we are applying config:
creating new extensions, roles, etc...
---
 compute_tools/src/compute.rs              | 21 +++++++-
 compute_tools/src/config.rs               | 20 +++++++
 compute_tools/src/spec.rs                 | 13 -----
 test_runner/regress/test_timeline_size.py | 66 +++++++++++++++++++++++
 4 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 0dfacb615c..28770acdcd 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -728,7 +728,12 @@ impl ComputeNode {
 
         // Write new config
         let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
+        config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
+        // temporarily reset max_cluster_size in config
+        // to avoid the possibility of hitting the limit, while we are reconfiguring:
+        // creating new extensions, roles, etc...
+        config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
         self.pg_reload_conf()?;
 
         let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
@@ -749,6 +754,10 @@ impl ComputeNode {
         // 'Close' connection
         drop(client);
 
+        // reset max_cluster_size in config back to original value and reload config
+        config::compute_ctl_temp_override_remove(pgdata_path)?;
+        self.pg_reload_conf()?;
+
         let unknown_op = "unknown".to_string();
         let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
         info!(
@@ -809,7 +818,17 @@ impl ComputeNode {
 
         let config_time = Utc::now();
         if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+            let pgdata_path = Path::new(&self.pgdata);
+            // temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are applying config:
+            // creating new extensions, roles, etc...
+            config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+            self.pg_reload_conf()?;
+
             self.apply_config(&compute_state)?;
+
+            config::compute_ctl_temp_override_remove(pgdata_path)?;
+            self.pg_reload_conf()?;
         }
 
         let startup_end_time = Utc::now();
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index bc48a2110d..a7ef8cea92 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -93,5 +93,25 @@ pub fn write_postgres_conf(
         writeln!(file, "neon.extension_server_port={}", port)?;
     }
 
+    // It is essential to keep this line at the end of the file,
+    // because it is intended to override any settings above.
+    writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
+
+    Ok(())
+}
+
+/// Create the file `compute_ctl_temp_override.conf` in `pgdata_path`
+/// and write the provided options to it.
+pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    let mut file = File::create(path)?;
+    write!(file, "{}", options)?;
+    Ok(())
+}
+
+/// Remove the file `compute_ctl_temp_override.conf` from `pgdata_path`.
+pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    std::fs::remove_file(path)?;
     Ok(())
 }
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 8c44c6d519..f98333d8bf 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -118,19 +118,6 @@ pub fn get_spec_from_control_plane(
     spec
 }
 
-/// It takes cluster specification and does the following:
-/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
-/// - Update `pg_hba.conf` to allow external connections.
-pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
-    // File `postgresql.conf` is no longer included into `basebackup`, so just
-    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
-
-    update_pg_hba(pgdata_path)?;
-
-    Ok(())
-}
-
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
     // XXX: consider making it a part of spec.json
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index eb98348823..24cbe34457 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -146,6 +146,72 @@ def wait_for_pageserver_catchup(endpoint_main: Endpoint, polling_interval=1, tim
         time.sleep(polling_interval)
 
 
+def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
+
+    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+
+    endpoint_main = env.endpoints.create(
+        "test_timeline_size_quota_on_startup",
+        # Set small limit for the test
+        config_lines=["neon.max_cluster_size=30MB"],
+    )
+    endpoint_main.start()
+
+    log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch")
+
+    with closing(endpoint_main.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CREATE TABLE foo (t text)")
+
+            # Insert many rows. This query must fail because of space limit
+            try:
+                for _i in range(5000):
+                    cur.execute(
+                        """
+                        INSERT INTO foo
+                            SELECT 'long string to consume some space' || g
+                            FROM generate_series(1, 100) g
+                    """
+                    )
+
+                # If we get here, the timeline size limit failed
+                log.error("Query unexpectedly succeeded")
+                raise AssertionError()
+
+            except psycopg2.errors.DiskFull as err:
+                log.info(f"Query expectedly failed with: {err}")
+
+    # Restart endpoint that reached the limit to ensure that it doesn't fail on startup
+    # i.e. the size limit is not enforced during startup.
+    endpoint_main.stop()
+    # don't skip pg_catalog updates - it runs CREATE EXTENSION neon
+    # which is needed for neon.pg_cluster_size() to work
+    endpoint_main.respec(skip_pg_catalog_updates=False)
+    endpoint_main.start()
+
+    # ensure that the limit is enforced after startup
+    with closing(endpoint_main.connect()) as conn:
+        with conn.cursor() as cur:
+            # This query must fail because of space limit
+            try:
+                cur.execute(
+                    """
+                    INSERT INTO foo
+                        SELECT 'long string to consume some space' || g
+                        FROM generate_series(1, 100000) g
+                """
+                )
+                # If we get here, the timeline size limit failed
+                log.error("Query unexpectedly succeeded")
+                raise AssertionError()
+
+            except psycopg2.errors.DiskFull as err:
+                log.info(f"Query expectedly failed with: {err}")
+
+
 def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     client = env.pageserver.http_client()

From 1d81e70d601f6d8d823d1dc0693efdd424ea18c8 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 4 Dec 2023 09:57:47 +0000
Subject: [PATCH 019/202] pageserver: tweak logs for index_part loading (#6005)

## Problem

On pageservers upgraded to enable generations, these INFO level logs
were rather frequent. If a tenant timeline hasn't written new layers
since the upgrade, it will emit the "No index_part.json*" log every time
it starts.

## Summary of changes

- Downgrade two log lines from info to debug
- Add a tiny unit test that I wrote for sanity-checking that there
wasn't something wrong with our Generation-comparing logic when loading
index parts.
---
 libs/remote_storage/src/s3_bucket.rs                |  2 +-
 libs/utils/src/generation.rs                        | 13 +++++++++++++
 .../src/tenant/remote_timeline_client/download.rs   |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 0cb73f73b7..3016a14ec9 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -378,7 +378,7 @@ impl RemoteStorage for S3Bucket {
             let empty = Vec::new();
             let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
 
-            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
 
             for object in keys {
                 let object_path = object.key().expect("response does not contain a key");
diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs
index 49e290dab8..46eadee1da 100644
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -152,3 +152,16 @@ impl Debug for Generation {
         }
     }
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn generation_gt() {
+        // Important that a None generation compares less than a valid one, during upgrades from
+        // pre-generation systems.
+        assert!(Generation::none() < Generation::new(0));
+        assert!(Generation::none() < Generation::new(1));
+    }
+}
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 3b5fe4b207..deb5ea84a8 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -363,7 +363,7 @@ pub(super) async fn download_index_part(
         None => {
             // Migration from legacy pre-generation state: we have a generation but no prior
             // attached pageservers did.  Try to load from a no-generation path.
-            tracing::info!("No index_part.json* found");
+            tracing::debug!("No index_part.json* found");
             do_download_index_part(
                 storage,
                 tenant_shard_id,

From e6b2f89fecb2b9129a1d6b00ad370bf81bfc4177 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 4 Dec 2023 11:18:41 +0000
Subject: [PATCH 020/202] test_pg_clients: fix test that reads from stdout
 (#6021)

## Problem

`test_pg_clients` reads the actual result from a *.stdout file,
https://github.com/neondatabase/neon/pull/5977 has added a header to
such files, so `test_pg_clients` started to fail.

## Summary of changes
- Use `capture_stdout` and compare the expected result with the output
instead of *.stdout file content
---
 test_runner/pg_clients/test_pg_clients.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py
index 8381eac946..3579c92b0c 100644
--- a/test_runner/pg_clients/test_pg_clients.py
+++ b/test_runner/pg_clients/test_pg_clients.py
@@ -48,6 +48,6 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st
     subprocess_capture(test_output_dir, build_cmd, check=True)
 
     run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag]
-    basepath, _, _ = subprocess_capture(test_output_dir, run_cmd, check=True)
+    _, output, _ = subprocess_capture(test_output_dir, run_cmd, check=True, capture_stdout=True)
 
-    assert Path(f"{basepath}.stdout").read_text().strip() == "1"
+    assert str(output).strip() == "1"

From eae49ff598c5cac3986c3b9d072a456c6a392f23 Mon Sep 17 00:00:00 2001
From: John Khvatov <ivaxer@yandex-team.com>
Date: Mon, 4 Dec 2023 15:35:09 +0300
Subject: [PATCH 021/202] Perform L0 compaction before creating new image
 layers (#5950)

If there are too many L0 layers before compaction, the compaction
process becomes slow because of slow `Timeline::get`. As a result of the
slowdown, the pageserver will generate even more L0 layers for the next
iteration, further exacerbating the slow performance.

Change to perform L0 -> L1 compaction before creating new images. This
simple change speeds up compaction and `Timeline::get` by up to 5x, since
`Timeline::get` is faster on top of L1 layers.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/src/tenant/timeline.rs              | 12 ++++++------
 test_runner/regress/test_layers_from_future.py |  6 +++++-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index bf4e19e5fb..e252ee584e 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -801,7 +801,12 @@ impl Timeline {
                     .access_stats_behavior(AccessStatsBehavior::Skip)
                     .build();
 
-                // 2. Create new image layers for partitions that have been modified
+                // 2. Compact
+                let timer = self.metrics.compact_time_histo.start_timer();
+                self.compact_level0(target_file_size, ctx).await?;
+                timer.stop_and_record();
+
+                // 3. Create new image layers for partitions that have been modified
                 // "enough".
                 let layers = self
                     .create_image_layers(&partitioning, lsn, false, &image_ctx)
@@ -813,11 +818,6 @@ impl Timeline {
                     }
                 }
 
-                // 3. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(target_file_size, ctx).await?;
-                timer.stop_and_record();
-
                 if let Some(remote_client) = &self.remote_client {
                     // should any new image layer been created, not uploading index_part will
                     // result in a mismatch between remote_physical_size and layermap calculated
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index b36c927628..7ce0bdaeba 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -49,7 +49,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
         "compaction_period": "0s",  # we want to control when compaction runs
         "checkpoint_timeout": "24h",  # something we won't reach
         "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
-        "image_creation_threshold": f"{image_creation_threshold}",
+        "image_creation_threshold": "100",  # we want to control when image is created
         "compaction_threshold": f"{l0_l1_threshold}",
         "compaction_target_size": f"{128 * (1024**3)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
     }
@@ -124,6 +124,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     ), "sanity check for what above loop is supposed to do"
 
     # create the image layer from the future
+    ps_http.patch_tenant_config_client_side(
+        tenant_id, {"image_creation_threshold": image_creation_threshold}, None
+    )
+    assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
     ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
     assert (
         len(

From 207c527270c461868e723794a9183020e8cc14cc Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 30 Nov 2023 16:35:16 +0300
Subject: [PATCH 022/202] Safekeepers: persist state before timeline
 deactivation.

Without it, we sometimes lose the latest remote_consistent_lsn on restart,
which leads to excessive ps -> sk reconnections.

https://github.com/neondatabase/neon/issues/5993
---
 safekeeper/src/safekeeper.rs             | 19 ++++++------
 safekeeper/src/timeline.rs               | 37 +++++++++++++++---------
 test_runner/regress/test_wal_acceptor.py | 31 +++++++++++++++-----
 3 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 47a624281d..217a5f89ee 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -914,9 +914,14 @@ where
         Ok(())
     }
 
-    /// Persist control file to disk, called only after timeline creation (bootstrap).
-    pub async fn persist(&mut self) -> Result<()> {
-        self.persist_control_file(self.state.clone()).await
+    /// Persist in-memory state of control file to disk.
+    //
+    // TODO: passing inmem_remote_consistent_lsn everywhere is ugly, better
+    // separate state completely and give Arc to all those who need it.
+    pub async fn persist_inmem(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
+        let mut state = self.state.clone();
+        state.remote_consistent_lsn = inmem_remote_consistent_lsn;
+        self.persist_control_file(state).await
     }
 
     /// Persist in-memory state to the disk, taking other data from state.
@@ -930,7 +935,7 @@ where
 
     /// Persist control file if there is something to save and enough time
     /// passed after the last save.
-    pub async fn maybe_persist_control_file(
+    pub async fn maybe_persist_inmem_control_file(
         &mut self,
         inmem_remote_consistent_lsn: Lsn,
     ) -> Result<()> {
@@ -943,9 +948,7 @@ where
             || self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
             || inmem_remote_consistent_lsn > self.state.remote_consistent_lsn;
         if need_persist {
-            let mut state = self.state.clone();
-            state.remote_consistent_lsn = inmem_remote_consistent_lsn;
-            self.persist_control_file(state).await?;
+            self.persist_inmem(inmem_remote_consistent_lsn).await?;
             trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
         }
         Ok(())
@@ -1064,8 +1067,6 @@ where
 
         if sync_control_file {
             let mut state = self.state.clone();
-            // Note: we could make remote_consistent_lsn update in cf common by
-            // storing Arc to walsenders in Safekeeper.
             state.remote_consistent_lsn = new_remote_consistent_lsn;
             self.persist_control_file(state).await?;
         }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 2ba871207e..bdc9088138 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -182,8 +182,9 @@ impl SharedState {
     }
 
     /// Mark timeline active/inactive and return whether s3 offloading requires
-    /// start/stop action.
-    fn update_status(
+    /// start/stop action. If timeline is deactivated, control file is persisted
+    /// as maintenance task does that only for active timelines.
+    async fn update_status(
         &mut self,
         num_computes: usize,
         remote_consistent_lsn: Lsn,
@@ -191,7 +192,15 @@ impl SharedState {
     ) -> bool {
         let is_active = self.is_active(num_computes, remote_consistent_lsn);
         if self.active != is_active {
-            info!("timeline {} active={} now", ttid, is_active);
+            info!(
+                "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
+                ttid, is_active, remote_consistent_lsn, self.sk.inmem.commit_lsn
+            );
+            if !is_active {
+                if let Err(e) = self.sk.persist_inmem(remote_consistent_lsn).await {
+                    warn!("control file save in update_status failed: {:?}", e);
+                }
+            }
         }
         self.active = is_active;
         self.is_wal_backup_action_pending(num_computes)
@@ -438,7 +447,7 @@ impl Timeline {
         fs::create_dir_all(&self.timeline_dir).await?;
 
         // Write timeline to disk and start background tasks.
-        if let Err(e) = shared_state.sk.persist().await {
+        if let Err(e) = shared_state.sk.persist_inmem(Lsn::INVALID).await {
             // Bootstrap failed, cancel timeline and remove timeline directory.
             self.cancel(shared_state);
 
@@ -511,12 +520,14 @@ impl Timeline {
         self.mutex.lock().await
     }
 
-    fn update_status(&self, shared_state: &mut SharedState) -> bool {
-        shared_state.update_status(
-            self.walreceivers.get_num(),
-            self.get_walsenders().get_remote_consistent_lsn(),
-            self.ttid,
-        )
+    async fn update_status(&self, shared_state: &mut SharedState) -> bool {
+        shared_state
+            .update_status(
+                self.walreceivers.get_num(),
+                self.get_walsenders().get_remote_consistent_lsn(),
+                self.ttid,
+            )
+            .await
     }
 
     /// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
@@ -526,7 +537,7 @@ impl Timeline {
         }
         let is_wal_backup_action_pending: bool = {
             let mut shared_state = self.write_shared_state().await;
-            self.update_status(&mut shared_state)
+            self.update_status(&mut shared_state).await
         };
         if is_wal_backup_action_pending {
             // Can fail only if channel to a static thread got closed, which is not normal at all.
@@ -683,7 +694,7 @@ impl Timeline {
             shared_state.sk.record_safekeeper_info(&sk_info).await?;
             let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
             shared_state.peers_info.upsert(&peer_info);
-            is_wal_backup_action_pending = self.update_status(&mut shared_state);
+            is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
             commit_lsn = shared_state.sk.inmem.commit_lsn;
         }
         self.commit_lsn_watch_tx.send(commit_lsn)?;
@@ -828,7 +839,7 @@ impl Timeline {
         self.write_shared_state()
             .await
             .sk
-            .maybe_persist_control_file(remote_consistent_lsn)
+            .maybe_persist_inmem_control_file(remote_consistent_lsn)
             .await
     }
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index b7eaaf39bc..ad12b56874 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -30,6 +30,7 @@ from fixtures.neon_fixtures import (
     Safekeeper,
     SafekeeperHttpClient,
     SafekeeperPort,
+    last_flush_lsn_upload,
 )
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
@@ -286,29 +287,43 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
     # wait until remote_consistent_lsn gets advanced on all safekeepers
     clients = [sk.http_client() for sk in env.safekeepers]
     stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
-    log.info(f"statuses is {stat_before}")
+    log.info(f"statuses before insert: {stat_before}")
 
     endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'")
 
-    # force checkpoint in pageserver to advance remote_consistent_lsn
-    wait_lsn_force_checkpoint(tenant_id, timeline_id, endpoint, env.pageserver)
+    # wait for remote_consistent_lsn to reach flush_lsn, forcing it with checkpoint
+    new_rcl = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+    log.info(f"new_rcl: {new_rcl}")
+    endpoint.stop()
 
     # and wait till remote_consistent_lsn propagates to all safekeepers
+    #
+    # TODO: this executes long as timeline on safekeeper is immediately
+    # deactivated once rcl reaches pageserver one, and thus we generally wait
+    # till pageserver reconnects to all safekeepers one by one here. Timeline
+    # status on safekeeper should take into account peers state as well.
     started_at = time.time()
     while True:
         stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
-        if all(
-            s_after.remote_consistent_lsn > s_before.remote_consistent_lsn
-            for s_after, s_before in zip(stat_after, stat_before)
-        ):
+        if all([s_after.remote_consistent_lsn >= new_rcl for s_after in stat_after]):
             break
         elapsed = time.time() - started_at
-        if elapsed > 20:
+        if elapsed > 30:
             raise RuntimeError(
                 f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
             )
         time.sleep(1)
 
+    # Ensure that safekeepers don't lose remote_consistent_lsn on restart.
+    # Control file is persisted each 5s. TODO: do that on shutdown and remove sleep.
+    time.sleep(6)
+    for sk in env.safekeepers:
+        sk.stop()
+        sk.start()
+    stat_after_restart = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
+    log.info(f"statuses after {stat_after_restart}")
+    assert all([s.remote_consistent_lsn >= new_rcl for s in stat_after_restart])
+
 
 # Test that old WAL consumed by peers and pageserver is removed from safekeepers.
 @pytest.mark.parametrize("auth_enabled", [False, True])

From 12f02523a46819a649570daf42178bfe6cc2e03b Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Mon, 4 Dec 2023 16:00:24 +0100
Subject: [PATCH 023/202] Enable dynamic rate limiter (#6029)

## Problem

Limit the number of open connections between the control plane and
proxy.

## Summary of changes

Enable dynamic rate limiter in prod.

Unfortunately the latency metrics are a bit broken, but from the logs I
see that on staging over the past 7 days the acquire latency exceeded
1ms only twice (in most cases it is insignificant).
---
 proxy/src/bin/proxy.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 7457e26867..6c4189de75 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -103,7 +103,7 @@ struct ProxyCliArgs {
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     require_client_ip: bool,
     /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
-    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     disable_dynamic_rate_limiter: bool,
     /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
     #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]

From 7403d55013e44aa7f1a7420676cdcb1fde95335e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 4 Dec 2023 17:06:41 +0100
Subject: [PATCH 024/202] walredo: stderr cleanup & make explicitly cancel safe
 (#6031)

# Problem

I need walredo to be cancellation-safe for
https://github.com/neondatabase/neon/pull/6000#discussion_r1412049728

# Solution

We are only `async fn` because of
`wait_for(stderr_logger_task_done).await`, added in #5560.

The `stderr_logger_cancel` and `stderr_logger_task_done` were there out
of precaution that the stderr logger task might for some reason not stop
when the walredo process terminates.
That hasn't been a problem in practice.
So, simplify things:
- remove `stderr_logger_cancel` and the
`wait_for(...stderr_logger_task_done...)`
- use `tokio::process::ChildStderr` in the stderr logger task
- add metrics to track number of running stderr logger tasks so in case
I'm wrong here, we can use these metrics to identify the issue (not
planning to put them into a dashboard or anything)
---
 pageserver/src/metrics.rs | 17 +++++++
 pageserver/src/tenant.rs  |  6 +++
 pageserver/src/walredo.rs | 93 +++++++++++++--------------------------
 3 files changed, 53 insertions(+), 63 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index d2684691e0..0cfbfcdf2f 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1385,6 +1385,8 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =
 pub(crate) struct WalRedoProcessCounters {
     pub(crate) started: IntCounter,
     pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
+    pub(crate) active_stderr_logger_tasks_started: IntCounter,
+    pub(crate) active_stderr_logger_tasks_finished: IntCounter,
 }
 
 #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
@@ -1408,6 +1410,19 @@ impl Default for WalRedoProcessCounters {
             &["cause"],
         )
         .unwrap();
+
+        let active_stderr_logger_tasks_started = register_int_counter!(
+            "pageserver_walredo_stderr_logger_tasks_started_total",
+            "Number of active walredo stderr logger tasks that have started",
+        )
+        .unwrap();
+
+        let active_stderr_logger_tasks_finished = register_int_counter!(
+            "pageserver_walredo_stderr_logger_tasks_finished_total",
+            "Number of active walredo stderr logger tasks that have finished",
+        )
+        .unwrap();
+
         Self {
             started,
             killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
@@ -1415,6 +1430,8 @@ impl Default for WalRedoProcessCounters {
                 let cause_str: &'static str = cause.into();
                 killed.with_label_values(&[cause_str])
             })),
+            active_stderr_logger_tasks_started,
+            active_stderr_logger_tasks_finished,
         }
     }
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 422cb671fe..f67a4174af 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -312,6 +312,9 @@ impl WalRedoManager {
         }
     }
 
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn request_redo(
         &self,
         key: crate::repository::Key,
@@ -3893,6 +3896,9 @@ pub(crate) mod harness {
     pub(crate) struct TestRedoManager;
 
     impl TestRedoManager {
+        /// # Cancel-Safety
+        ///
+        /// This method is cancellation-safe.
         pub async fn request_redo(
             &self,
             key: Key,
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index edce158e75..94e95fd3b3 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -34,7 +34,6 @@ use std::process::{Child, ChildStdin, ChildStdout, Command};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
-use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
 
@@ -124,7 +123,9 @@ impl PostgresRedoManager {
     /// The WAL redo is handled by a separate thread, so this just sends a request
     /// to the thread and waits for response.
     ///
-    /// CANCEL SAFETY: NOT CANCEL SAFE.
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn request_redo(
         &self,
         key: Key,
@@ -157,7 +158,6 @@ impl PostgresRedoManager {
                         self.conf.wal_redo_timeout,
                         pg_version,
                     )
-                    .await
                 };
                 img = Some(result?);
 
@@ -178,7 +178,6 @@ impl PostgresRedoManager {
                 self.conf.wal_redo_timeout,
                 pg_version,
             )
-            .await
         }
     }
 }
@@ -216,7 +215,7 @@ impl PostgresRedoManager {
     /// Process one request for WAL redo using wal-redo postgres
     ///
     #[allow(clippy::too_many_arguments)]
-    async fn apply_batch_postgres(
+    fn apply_batch_postgres(
         &self,
         key: Key,
         lsn: Lsn,
@@ -332,12 +331,7 @@ impl PostgresRedoManager {
                 // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                 // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                 // This probably needs revisiting at some later point.
-                let mut wait_done = proc.stderr_logger_task_done.clone();
                 drop(proc);
-                wait_done
-                    .wait_for(|v| *v)
-                    .await
-                    .expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
             } else if n_attempts != 0 {
                 info!(n_attempts, "retried walredo succeeded");
             }
@@ -649,8 +643,6 @@ struct WalRedoProcess {
     child: Option<NoLeakChild>,
     stdout: Mutex<ProcessOutput>,
     stdin: Mutex<ProcessInput>,
-    stderr_logger_cancel: CancellationToken,
-    stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
     /// Counter to separate same sized walredo inputs failing at the same millisecond.
     #[cfg(feature = "testing")]
     dump_sequence: AtomicUsize,
@@ -699,6 +691,8 @@ impl WalRedoProcess {
         let stdin = child.stdin.take().unwrap();
         let stdout = child.stdout.take().unwrap();
         let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
         macro_rules! set_nonblock_or_log_err {
             ($file:ident) => {{
                 let res = set_nonblock($file.as_raw_fd());
@@ -710,69 +704,45 @@ impl WalRedoProcess {
         }
         set_nonblock_or_log_err!(stdin)?;
         set_nonblock_or_log_err!(stdout)?;
-        set_nonblock_or_log_err!(stderr)?;
-
-        let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;
 
         // all fallible operations post-spawn are complete, so get rid of the guard
         let child = scopeguard::ScopeGuard::into_inner(child);
 
-        let stderr_logger_cancel = CancellationToken::new();
-        let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
-            tokio::sync::watch::channel(false);
-        tokio::spawn({
-            let stderr_logger_cancel = stderr_logger_cancel.clone();
+        tokio::spawn(
             async move {
                 scopeguard::defer! {
                     debug!("wal-redo-postgres stderr_logger_task finished");
-                    let _ = stderr_logger_task_done_tx.send(true);
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
                 }
                 debug!("wal-redo-postgres stderr_logger_task started");
-                loop {
-                    // NB: we purposefully don't do a select! for the cancellation here.
-                    // The cancellation would likely cause us to miss stderr messages.
-                    // We can rely on this to return from .await because when we SIGKILL
-                    // the child, the writing end of the stderr pipe gets closed.
-                    match stderr.readable_mut().await {
-                        Ok(mut guard) => {
-                            let mut errbuf = [0; 16384];
-                            let res = guard.try_io(|fd| {
-                                use std::io::Read;
-                                fd.get_mut().read(&mut errbuf)
-                            });
-                            match res {
-                                Ok(Ok(0)) => {
-                                    // it closed the stderr pipe
-                                    break;
-                                }
-                                Ok(Ok(n)) => {
-                                    // The message might not be split correctly into lines here. But this is
-                                    // good enough, the important thing is to get the message to the log.
-                                    let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
-                                    error!(output, "received output");
-                                },
-                                Ok(Err(e)) => {
-                                    error!(error = ?e, "read() error, waiting for cancellation");
-                                    stderr_logger_cancel.cancelled().await;
-                                    error!(error = ?e, "read() error, cancellation complete");
-                                    break;
-                                }
-                                Err(e) => {
-                                    let _e: tokio::io::unix::TryIoError = e;
-                                    // the read() returned WouldBlock, that's expected
-                                }
-                            }
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
                         }
                         Err(e) => {
-                            error!(error = ?e, "read() error, waiting for cancellation");
-                            stderr_logger_cancel.cancelled().await;
-                            error!(error = ?e, "read() error, cancellation complete");
-                            break;
+                            break Err(e);
                         }
                     }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
                 }
             }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
-        });
+        );
 
         Ok(Self {
             conf,
@@ -787,8 +757,6 @@ impl WalRedoProcess {
                 pending_responses: VecDeque::new(),
                 n_processed_responses: 0,
             }),
-            stderr_logger_cancel,
-            stderr_logger_task_done: stderr_logger_task_done_rx,
             #[cfg(feature = "testing")]
             dump_sequence: AtomicUsize::default(),
         })
@@ -1029,7 +997,6 @@ impl Drop for WalRedoProcess {
             .take()
             .expect("we only do this once")
             .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        self.stderr_logger_cancel.cancel();
         // no way to wait for stderr_logger_task from Drop because that is async only
     }
 }

From c7f1143e570924eadd15053949647707ba042c5b Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 4 Dec 2023 18:22:26 +0100
Subject: [PATCH 025/202] concurrency-limit low-priority initial logical size
 calculation [v2] (#6000)

Problem
-------

Before this PR, there was no concurrency limit on initial logical size
computations.

While logical size computations are lazy in theory, in practice
(production), they happen in a short timeframe after restart.

This means that on a PS with 20k tenants, we'd have up to 20k concurrent
initial logical size calculation requests.

This is self-inflicted needless overload.

This hasn't been a problem so far because the `.await` points on the
logical size calculation path never return `Pending`, hence we have a
natural concurrency limit of the number of executor threads.
But, as soon as we return `Pending` somewhere in the logical size
calculation path, other concurrent tasks get scheduled by tokio.
If these other tasks are also logical size calculations, they eventually
pound on the same bottleneck.

For example, in #5479, we want to switch the VirtualFile descriptor
cache to a `tokio::sync::RwLock`, which makes us return `Pending`, and
without measures like this patch, after PS restart, VirtualFile
descriptor cache thrashes heavily for 2 hours until all the logical size
calculations have been computed and the degree of concurrency /
concurrent VirtualFile operations is down to regular levels.
See the *Experiment* section below for details.

<!-- Experiments (see below) show that plain #5479 causes heavy
thrashing of the VirtualFile descriptor cache.
The high degree of concurrency is too much for
In the case of #5479 the VirtualFile descriptor cache size starts
thrashing heavily.


-->

Background
----------

Before this PR, initial logical size calculation was spawned lazily on
first call to `Timeline::get_current_logical_size()`.

In practice (prod), the lazy calculation is triggered by
`WalReceiverConnectionHandler` if the timeline is active according to
storage broker, or by the first iteration of consumption metrics worker
after restart (`MetricsCollection`).

The spawns by walreceiver are high-priority because logical size is
needed by Safekeepers (via walreceiver `PageserverFeedback`) to enforce
the project logical size limit.
The spawns by metrics collection are not on the user-critical path and
hence low-priority. [^consumption_metrics_slo]

[^consumption_metrics_slo]: We can't delay metrics collection
indefinitely because there are TBD internal SLOs tied to metrics
collection happening in a timely manner
(https://github.com/neondatabase/cloud/issues/7408). But let's ignore
that in this issue.

The ratio of walreceiver-initiated spawns vs
consumption-metrics-initiated spawns can be reconstructed from logs
(`spawning logical size computation from context of task kind {:?}`).
PRs #5995 and #6018 add metrics for this.

A first investigation of the ratio led to the discovery that walreceiver
spawns 75% of init logical size computations.
That's because of two bugs:
- In Safekeepers: https://github.com/neondatabase/neon/issues/5993
- In interaction between Pageservers and Safekeepers:
https://github.com/neondatabase/neon/issues/5962

The safekeeper bug is likely primarily responsible but we don't have the
data yet. The metrics will hopefully provide some insights.

When assessing production-readiness of this PR, please assume that
neither of these bugs is fixed yet.


Changes In This PR
------------------

With this PR, initial logical size calculation is reworked as follows:

First, all initial logical size calculation task_mgr tasks are started
early, as part of timeline activation, and run a retry loop with long
back-off until success. This removes the lazy computation; it was
needless complexity because in practice, we compute all logical sizes
anyways, because consumption metrics collects it.

Second, within the initial logical size calculation task, each attempt
queues behind the background loop concurrency limiter semaphore. This
fixes the performance issue that we pointed out in the "Problem" section
earlier.

Third, there is a twist to queuing behind the background loop
concurrency limiter semaphore. Logical size is needed by Safekeepers
(via walreceiver `PageserverFeedback`) to enforce the project logical
size limit. However, we currently do open walreceiver connections even
before we have an exact logical size. That's bad, and I'll build on top
of this PR to fix that
(https://github.com/neondatabase/neon/issues/5963). But, for the
purposes of this PR, we don't want to introduce a regression, i.e., we
don't want to provide an exact value later than before this PR. The
solution is to introduce a priority-boosting mechanism
(`GetLogicalSizePriority`), allowing callers of
`Timeline::get_current_logical_size` to specify how urgently they need
an exact value. The effect of specifying high urgency is that the
initial logical size calculation task for the timeline will skip the
concurrency limiting semaphore. This should yield effectively the same
behavior as we had before this PR with lazy spawning.

Last, the priority-boosting mechanism obsoletes the `init_order`'s grace
period for initial logical size calculations. It's a separate commit to
reduce the churn during review. We can drop that commit if people think
it's too much churn, and commit it later once we know this PR here
worked as intended.

Experiment With #5479
---------------------

I validated this PR combined with #5479 to assess whether we're making
forward progress towards asyncification.

The setup is an `i3en.3xlarge` instance with 20k tenants, each with one
timeline that has 9 layers.
All tenants are inactive, i.e., not known to SKs nor storage broker.
This means all initial logical size calculations are spawned by
consumption metrics `MetricsCollection` task kind.
The consumption metrics worker starts requesting logical sizes at low
priority immediately after restart. This is achieved by deleting the
consumption metrics cache file on disk before starting
PS.[^consumption_metrics_cache_file]

[^consumption_metrics_cache_file]: Consumption metrics worker persists
its interval across restarts to achieve persistent reporting intervals
across PS restarts; delete the state file on disk to get predictable
(and I believe worst-case in terms of concurrency during PS restart)
behavior.

Before this patch, all of these timelines would do their initial
logical size calculation in parallel, leading to extreme thrashing in
page cache and virtual file cache.

With this patch, the virtual file cache thrashing is reduced
significantly (from 80k `open`-system-calls/second to ~500
`open`-system-calls/second during loading).


### Critique

The obvious critique with above experiment is that there's no skipping
of the semaphore, i.e., the priority-boosting aspect of this PR is not
exercised.

If even just 1% of our 20k tenants in the setup were active in
SK/storage_broker, then 200 logical size calculations would skip the
limiting semaphore immediately after restart and run concurrently.

Further critique: given the two bugs wrt timeline inactive vs active
state that were mentioned in the Background section, we could have 75%
of our 20k tenants being (falsely) active on restart.

So... (next section)

This Doesn't Make Us Ready For Async VirtualFile
------------------------------------------------

This PR is a step towards asynchronous `VirtualFile`, aka, #5479 or even
#4744.

But it doesn't yet enable us to ship #5479.

The reason is that this PR doesn't limit the amount of high-priority
logical size computations.
If there are many high-priority logical size calculations requested,
we'll fall over like we did if #5479 is applied without this PR.
And currently, at the very least due to the bugs mentioned in the Background
section, we run thousands of high-priority logical size calculations on
PS startup in prod.

So, at a minimum, we need to fix these bugs.

Then we can ship #5479 and #4744, and things will likely be fine under
normal operation.

But in high-traffic situations, overload problems will still be more
likely to happen, e.g., VirtualFile cache descriptor thrashing.
The solution candidates for that are orthogonal to this PR though:
* global concurrency limiting
* per-tenant rate limiting => #5899
* load shedding
* scaling bottleneck resources (fd cache size (neondatabase/cloud#8351),
page cache size (neondatabase/cloud#8351), spread load across more PSes,
etc)

Conclusion
----------

Even with the remarks from the previous section, we should merge this
PR because:
1. it's an improvement over the status quo (esp. if the aforementioned
bugs wrt timeline active / inactive are fixed)
2. it prepares the way for
https://github.com/neondatabase/neon/pull/6010
3. it gets us close to shipping #5479 and #4744
---
 pageserver/src/bin/pageserver.rs              |  29 +-
 pageserver/src/consumption_metrics/metrics.rs |   7 +-
 pageserver/src/http/routes.rs                 |   3 +-
 pageserver/src/lib.rs                         |   7 -
 pageserver/src/metrics.rs                     |  27 +-
 pageserver/src/pgdatadir_mapping.rs           |   8 +
 pageserver/src/tenant.rs                      |  63 +--
 pageserver/src/tenant/delete.rs               |   7 +-
 pageserver/src/tenant/storage_layer/layer.rs  |   4 +
 pageserver/src/tenant/tasks.rs                |   1 +
 pageserver/src/tenant/timeline.rs             | 407 +++++++++++-------
 pageserver/src/tenant/timeline/delete.rs      |   3 -
 .../src/tenant/timeline/logical_size.rs       |  22 +-
 .../walreceiver/walreceiver_connection.rs     |   5 +-
 test_runner/regress/test_ondemand_download.py |   2 +-
 .../regress/test_pageserver_restart.py        |   1 -
 16 files changed, 306 insertions(+), 290 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 542c1b7b30..43b35c6d08 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -402,15 +402,11 @@ fn start_pageserver(
     let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
     let (init_done_tx, init_done_rx) = utils::completion::channel();
 
-    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
-
     let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
 
     let order = pageserver::InitializationOrder {
         initial_tenant_load_remote: Some(init_done_tx),
         initial_tenant_load: Some(init_remote_done_tx),
-        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: Some(init_logical_size_done_tx),
         background_jobs_can_start: background_jobs_barrier.clone(),
     };
 
@@ -464,7 +460,7 @@ fn start_pageserver(
             });
 
             let WaitForPhaseResult {
-                timeout_remaining: timeout,
+                timeout_remaining: _timeout,
                 skipped: init_load_skipped,
             } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
 
@@ -472,26 +468,6 @@ fn start_pageserver(
 
             scopeguard::ScopeGuard::into_inner(guard);
 
-            let guard = scopeguard::guard_on_success((), |_| {
-                tracing::info!("Cancelled before initial logical sizes completed")
-            });
-
-            let logical_sizes_done = std::pin::pin!(async {
-                init_logical_size_done_rx.wait().await;
-                startup_checkpoint(
-                    started_startup_at,
-                    "initial_logical_sizes",
-                    "Initial logical sizes completed",
-                );
-            });
-
-            let WaitForPhaseResult {
-                timeout_remaining: _,
-                skipped: logical_sizes_skipped,
-            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
-
-            scopeguard::ScopeGuard::into_inner(guard);
-
             // allow background jobs to start: we either completed prior stages, or they reached timeout
             // and were skipped.  It is important that we do not let them block background jobs indefinitely,
             // because things like consumption metrics for billing are blocked by this barrier.
@@ -514,9 +490,6 @@ fn start_pageserver(
             if let Some(f) = init_load_skipped {
                 f.await;
             }
-            if let Some(f) = logical_sizes_skipped {
-                f.await;
-            }
             scopeguard::ScopeGuard::into_inner(guard);
 
             startup_checkpoint(started_startup_at, "complete", "Startup complete");
diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs
index c6ff91e560..d70f1fec4d 100644
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -351,7 +351,12 @@ impl TimelineSnapshot {
 
             let current_exact_logical_size = {
                 let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
-                let size = span.in_scope(|| t.get_current_logical_size(ctx));
+                let size = span.in_scope(|| {
+                    t.get_current_logical_size(
+                        crate::tenant::timeline::GetLogicalSizePriority::Background,
+                        ctx,
+                    )
+                });
                 match size {
                     // Only send timeline logical size when it is fully calculated.
                     CurrentLogicalSize::Exact(ref size) => Some(size.into()),
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 29a1ff52e8..71b7ea05ec 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -338,7 +338,8 @@ async fn build_timeline_info_common(
         Lsn(0) => None,
         lsn @ Lsn(_) => Some(lsn),
     };
-    let current_logical_size = timeline.get_current_logical_size(ctx);
+    let current_logical_size =
+        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
     let current_physical_size = Some(timeline.layer_size_sum().await);
     let state = timeline.current_state();
     let remote_consistent_lsn_projected = timeline
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 3f74694ef2..0bdf096bfe 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -186,13 +186,6 @@ pub struct InitializationOrder {
     /// Each initial tenant load task carries this until completion.
     pub initial_tenant_load: Option<utils::completion::Completion>,
 
-    /// Barrier for when we can start initial logical size calculations.
-    pub initial_logical_size_can_start: utils::completion::Barrier,
-
-    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
-    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: Option<utils::completion::Completion>,
-
     /// Barrier for when we can start any background jobs.
     ///
     /// This can be broken up later on, but right now there is just one class of a background job.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 0cfbfcdf2f..6e311041ba 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -407,16 +407,14 @@ pub(crate) mod initial_logical_size {
     use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
     use once_cell::sync::Lazy;
 
-    use crate::task_mgr::TaskKind;
-
     pub(crate) struct StartCalculation(IntCounterVec);
     pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
         StartCalculation(
             register_int_counter_vec!(
                 "pageserver_initial_logical_size_start_calculation",
                 "Incremented each time we start an initial logical size calculation attempt. \
-                 The `task_kind` label is for the task kind that caused this attempt.",
-                &["attempt", "task_kind"]
+                 The `circumstances` label provides some additional details.",
+                &["attempt", "circumstances"]
             )
             .unwrap(),
         )
@@ -464,19 +462,24 @@ pub(crate) mod initial_logical_size {
         inc_drop_calculation: Option<IntCounter>,
     }
 
+    #[derive(strum_macros::IntoStaticStr)]
+    pub(crate) enum StartCircumstances {
+        EmptyInitial,
+        SkippedConcurrencyLimiter,
+        AfterBackgroundTasksRateLimit,
+    }
+
     impl StartCalculation {
-        pub(crate) fn first(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
-            let task_kind_label: &'static str =
-                causing_task_kind.map(|k| k.into()).unwrap_or_default();
-            self.0.with_label_values(&["first", task_kind_label]);
+        pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
+            let circumstances_label: &'static str = circumstances.into();
+            self.0.with_label_values(&["first", circumstances_label]);
             OngoingCalculationGuard {
                 inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
             }
         }
-        pub(crate) fn retry(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
-            let task_kind_label: &'static str =
-                causing_task_kind.map(|k| k.into()).unwrap_or_default();
-            self.0.with_label_values(&["retry", task_kind_label]);
+        pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
+            let circumstances_label: &'static str = circumstances.into();
+            self.0.with_label_values(&["retry", circumstances_label]);
             OngoingCalculationGuard {
                 inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
             }
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 15d5609ceb..a448142158 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -282,6 +282,10 @@ impl Timeline {
     }
 
     /// Get a list of all existing relations in given tablespace and database.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn list_rels(
         &self,
         spcnode: Oid,
@@ -630,6 +634,10 @@ impl Timeline {
     ///
     /// Only relation blocks are counted currently. That excludes metadata,
     /// SLRUs, twophase files etc.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn get_current_logical_size_non_incremental(
         &self,
         lsn: Lsn,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f67a4174af..0b2e48e1ff 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -472,7 +472,6 @@ impl Tenant {
         index_part: Option<IndexPart>,
         metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
-        init_order: Option<&InitializationOrder>,
         _ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let tenant_id = self.tenant_shard_id;
@@ -482,7 +481,6 @@ impl Tenant {
             &metadata,
             ancestor.clone(),
             resources,
-            init_order,
             CreateTimelineCause::Load,
         )?;
         let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
@@ -683,10 +681,6 @@ impl Tenant {
                     // as we are no longer loading, signal completion by dropping
                     // the completion while we resume deletion
                     drop(_completion);
-                    // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
-                    let _ = init_order
-                        .as_mut()
-                        .and_then(|x| x.initial_logical_size_attempt.take());
                     let background_jobs_can_start =
                         init_order.as_ref().map(|x| &x.background_jobs_can_start);
                     if let Some(background) = background_jobs_can_start {
@@ -700,7 +694,6 @@ impl Tenant {
                         &tenant_clone,
                         preload,
                         tenants,
-                        init_order,
                         &ctx,
                     )
                     .await
@@ -713,7 +706,7 @@ impl Tenant {
                     }
                 }
 
-                match tenant_clone.attach(init_order, preload, &ctx).await {
+                match tenant_clone.attach(preload, &ctx).await {
                     Ok(()) => {
                         info!("attach finished, activating");
                         tenant_clone.activate(broker_client, None, &ctx);
@@ -776,7 +769,6 @@ impl Tenant {
     ///
     async fn attach(
         self: &Arc<Tenant>,
-        init_order: Option<InitializationOrder>,
         preload: Option<TenantPreload>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
@@ -789,7 +781,7 @@ impl Tenant {
             None => {
                 // Deprecated dev mode: load from local disk state instead of remote storage
                 // https://github.com/neondatabase/neon/issues/5624
-                return self.load_local(init_order, ctx).await;
+                return self.load_local(ctx).await;
             }
         };
 
@@ -884,7 +876,6 @@ impl Tenant {
                 &index_part.metadata,
                 Some(remote_timeline_client),
                 self.deletion_queue_client.clone(),
-                None,
             )
             .await
             .context("resume_deletion")
@@ -1009,10 +1000,6 @@ impl Tenant {
             None
         };
 
-        // we can load remote timelines during init, but they are assumed to be so rare that
-        // initialization order is not passed to here.
-        let init_order = None;
-
         // timeline loading after attach expects to find metadata file for each metadata
         save_metadata(
             self.conf,
@@ -1030,7 +1017,6 @@ impl Tenant {
             Some(index_part),
             remote_metadata,
             ancestor,
-            init_order,
             ctx,
         )
         .await
@@ -1272,11 +1258,7 @@ impl Tenant {
     /// files on disk. Used at pageserver startup.
     ///
     /// No background tasks are started as part of this routine.
-    async fn load_local(
-        self: &Arc<Tenant>,
-        init_order: Option<InitializationOrder>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    async fn load_local(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
         span::debug_assert_current_span_has_tenant_id();
 
         debug!("loading tenant task");
@@ -1302,7 +1284,7 @@ impl Tenant {
         // Process loadable timelines first
         for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
             if let Err(e) = self
-                .load_local_timeline(timeline_id, local_metadata, init_order.as_ref(), ctx, false)
+                .load_local_timeline(timeline_id, local_metadata, ctx, false)
                 .await
             {
                 match e {
@@ -1336,13 +1318,7 @@ impl Tenant {
                 }
                 Some(local_metadata) => {
                     if let Err(e) = self
-                        .load_local_timeline(
-                            timeline_id,
-                            local_metadata,
-                            init_order.as_ref(),
-                            ctx,
-                            true,
-                        )
+                        .load_local_timeline(timeline_id, local_metadata, ctx, true)
                         .await
                     {
                         match e {
@@ -1370,12 +1346,11 @@ impl Tenant {
     /// Subroutine of `load_tenant`, to load an individual timeline
     ///
     /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, init_order, ctx))]
+    #[instrument(skip(self, local_metadata, ctx))]
     async fn load_local_timeline(
         self: &Arc<Self>,
         timeline_id: TimelineId,
         local_metadata: TimelineMetadata,
-        init_order: Option<&InitializationOrder>,
         ctx: &RequestContext,
         found_delete_mark: bool,
     ) -> Result<(), LoadLocalTimelineError> {
@@ -1392,7 +1367,6 @@ impl Tenant {
                 &local_metadata,
                 None,
                 self.deletion_queue_client.clone(),
-                init_order,
             )
             .await
             .context("resume deletion")
@@ -1409,17 +1383,9 @@ impl Tenant {
             None
         };
 
-        self.timeline_init_and_sync(
-            timeline_id,
-            resources,
-            None,
-            local_metadata,
-            ancestor,
-            init_order,
-            ctx,
-        )
-        .await
-        .map_err(LoadLocalTimelineError::Load)
+        self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx)
+            .await
+            .map_err(LoadLocalTimelineError::Load)
     }
 
     pub(crate) fn tenant_id(&self) -> TenantId {
@@ -2314,7 +2280,6 @@ impl Tenant {
         new_metadata: &TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         resources: TimelineResources,
-        init_order: Option<&InitializationOrder>,
         cause: CreateTimelineCause,
     ) -> anyhow::Result<Arc<Timeline>> {
         let state = match cause {
@@ -2329,9 +2294,6 @@ impl Tenant {
             CreateTimelineCause::Delete => TimelineState::Stopping,
         };
 
-        let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
-        let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);
-
         let pg_version = new_metadata.pg_version();
 
         let timeline = Timeline::new(
@@ -2345,8 +2307,6 @@ impl Tenant {
             Arc::clone(&self.walredo_mgr),
             resources,
             pg_version,
-            initial_logical_size_can_start.cloned(),
-            initial_logical_size_attempt.cloned().flatten(),
             state,
             self.cancel.child_token(),
         );
@@ -3168,7 +3128,6 @@ impl Tenant {
                 new_metadata,
                 ancestor,
                 resources,
-                None,
                 CreateTimelineCause::Load,
             )
             .context("Failed to create timeline data structure")?;
@@ -3843,7 +3802,7 @@ pub(crate) mod harness {
             match mode {
                 LoadMode::Local => {
                     tenant
-                        .load_local(None, ctx)
+                        .load_local(ctx)
                         .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                         .await?;
                 }
@@ -3853,7 +3812,7 @@ pub(crate) mod harness {
                         .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                         .await?;
                     tenant
-                        .attach(None, Some(preload), ctx)
+                        .attach(Some(preload), ctx)
                         .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                         .await?;
                 }
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index b7b2ef9c79..548b173c0d 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -15,7 +15,6 @@ use crate::{
     context::RequestContext,
     task_mgr::{self, TaskKind},
     tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
-    InitializationOrder,
 };
 
 use super::{
@@ -390,7 +389,6 @@ impl DeleteTenantFlow {
         tenant: &Arc<Tenant>,
         preload: Option<TenantPreload>,
         tenants: &'static std::sync::RwLock<TenantsMap>,
-        init_order: Option<InitializationOrder>,
         ctx: &RequestContext,
     ) -> Result<(), DeleteTenantError> {
         let (_, progress) = completion::channel();
@@ -400,10 +398,7 @@ impl DeleteTenantFlow {
             .await
             .expect("cant be stopping or broken");
 
-        tenant
-            .attach(init_order, preload, ctx)
-            .await
-            .context("attach")?;
+        tenant.attach(preload, ctx).await.context("attach")?;
 
         Self::background(
             guard,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 3ed4e05bea..e203d9d334 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -230,6 +230,10 @@ impl Layer {
     ///
     /// It is up to the caller to collect more data from the previous layer and
     /// perform WAL redo, if necessary.
+    ///
+    /// # Cancellation-Safety
+    ///
+    /// This method is cancellation-safe.
     pub(crate) async fn get_value_reconstruct_data(
         &self,
         key: Key,
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 138578ec8a..bc404c41a0 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -44,6 +44,7 @@ pub(crate) enum BackgroundLoopKind {
     Eviction,
     ConsumptionMetricsCollectMetrics,
     ConsumptionMetricsSyntheticSizeWorker,
+    InitialLogicalSizeCalculation,
 }
 
 impl BackgroundLoopKind {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e252ee584e..f02fd733b4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -20,23 +20,27 @@ use pageserver_api::{
     },
     shard::TenantShardId,
 };
+use rand::Rng;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
 use tokio::{
     runtime::Handle,
-    sync::{oneshot, watch, TryAcquireError},
+    sync::{oneshot, watch},
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{id::TenantTimelineId, sync::gate::Gate};
 
-use std::cmp::{max, min, Ordering};
 use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
+use std::{
+    cmp::{max, min, Ordering},
+    ops::ControlFlow,
+};
 
 use crate::context::{
     AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
@@ -298,13 +302,6 @@ pub struct Timeline {
 
     eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
 
-    /// Barrier to wait before doing initial logical size calculation. Used only during startup.
-    initial_logical_size_can_start: Option<completion::Barrier>,
-
-    /// Completion shared between all timelines loaded during startup; used to delay heavier
-    /// background tasks until some logical sizes have been calculated.
-    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
-
     /// Load or creation time information about the disk_consistent_lsn and when the loading
     /// happened. Used for consumption metrics.
     pub(crate) loaded_at: (Lsn, SystemTime),
@@ -453,6 +450,11 @@ pub enum LogicalSizeCalculationCause {
     TenantSizeHandler,
 }
 
+pub enum GetLogicalSizePriority {
+    User,
+    Background,
+}
+
 #[derive(enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
     ForceRepartition,
@@ -489,6 +491,9 @@ impl Timeline {
     /// an ancestor branch, for example, or waste a lot of cycles chasing the
     /// non-existing key.
     ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn get(
         &self,
         key: Key,
@@ -849,46 +854,6 @@ impl Timeline {
         }
     }
 
-    /// Retrieve current logical size of the timeline.
-    ///
-    /// The size could be lagging behind the actual number, in case
-    /// the initial size calculation has not been run (gets triggered on the first size access).
-    ///
-    /// return size and boolean flag that shows if the size is exact
-    pub(crate) fn get_current_logical_size(
-        self: &Arc<Self>,
-        ctx: &RequestContext,
-    ) -> logical_size::CurrentLogicalSize {
-        let current_size = self.current_logical_size.current_size();
-        debug!("Current size: {current_size:?}");
-
-        if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
-            (current_size, self.current_logical_size.initial_part_end)
-        {
-            self.try_spawn_size_init_task(initial_part_end, ctx);
-        }
-
-        if let CurrentLogicalSize::Approximate(_) = &current_size {
-            if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
-                let first = self
-                    .current_logical_size
-                    .did_return_approximate_to_walreceiver
-                    .compare_exchange(
-                        false,
-                        true,
-                        AtomicOrdering::Relaxed,
-                        AtomicOrdering::Relaxed,
-                    )
-                    .is_ok();
-                if first {
-                    crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
-                }
-            }
-        }
-
-        current_size
-    }
-
     /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
     /// the in-memory layer, and initiate flushing it if so.
     ///
@@ -938,6 +903,7 @@ impl Timeline {
         background_jobs_can_start: Option<&completion::Barrier>,
         ctx: &RequestContext,
     ) {
+        self.spawn_initial_logical_size_computation_task(ctx);
         self.launch_wal_receiver(ctx, broker_client);
         self.set_state(TimelineState::Active);
         self.launch_eviction_task(background_jobs_can_start);
@@ -1051,17 +1017,6 @@ impl Timeline {
                 error!("Not activating a Stopping timeline");
             }
             (_, new_state) => {
-                if matches!(
-                    new_state,
-                    TimelineState::Stopping | TimelineState::Broken { .. }
-                ) {
-                    // drop the completion guard, if any; it might be holding off the completion
-                    // forever needlessly
-                    self.initial_logical_size_attempt
-                        .lock()
-                        .unwrap_or_else(|e| e.into_inner())
-                        .take();
-                }
                 self.state.send_replace(new_state);
             }
         }
@@ -1383,8 +1338,6 @@ impl Timeline {
         walredo_mgr: Arc<super::WalRedoManager>,
         resources: TimelineResources,
         pg_version: u32,
-        initial_logical_size_can_start: Option<completion::Barrier>,
-        initial_logical_size_attempt: Option<completion::Completion>,
         state: TimelineState,
         cancel: CancellationToken,
     ) -> Arc<Self> {
@@ -1484,8 +1437,6 @@ impl Timeline {
                 ),
                 delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
 
-                initial_logical_size_can_start,
-                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
                 cancel,
                 gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),
 
@@ -1797,39 +1748,91 @@ impl Timeline {
         Ok(())
     }
 
-    fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
-        let state = self.current_state();
-        if matches!(
-            state,
-            TimelineState::Broken { .. } | TimelineState::Stopping
-        ) {
-            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
-            return;
+    /// Retrieve current logical size of the timeline.
+    ///
+    /// The size could be lagging behind the actual number, in case
+    /// the initial size calculation has not been run (gets triggered on the first size access).
+    ///
+    /// return size and boolean flag that shows if the size is exact
+    pub(crate) fn get_current_logical_size(
+        self: &Arc<Self>,
+        priority: GetLogicalSizePriority,
+        ctx: &RequestContext,
+    ) -> logical_size::CurrentLogicalSize {
+        let current_size = self.current_logical_size.current_size();
+        debug!("Current size: {current_size:?}");
+
+        match (current_size.accuracy(), priority) {
+            (logical_size::Accuracy::Exact, _) => (), // nothing to do
+            (logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
+                // background task will eventually deliver an exact value, we're in no rush
+            }
+            (logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
+                // background task is not ready, but user is asking for it now;
+                // => make the background task skip the line
+                // (The alternative would be to calculate the size here, but,
+                //  it can actually take a long time if the user has a lot of rels.
+                //  And we'll inevitable need it again; So, let the background task do the work.)
+                match self
+                    .current_logical_size
+                    .cancel_wait_for_background_loop_concurrency_limit_semaphore
+                    .get()
+                {
+                    Some(cancel) => cancel.cancel(),
+                    None => {
+                        let state = self.current_state();
+                        if matches!(
+                            state,
+                            TimelineState::Broken { .. } | TimelineState::Stopping
+                        ) {
+
+                            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
+                            // Don't make noise.
+                        } else {
+                            warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
+                        }
+                    }
+                };
+            }
         }
 
-        let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
-            .try_acquire_owned()
-        {
-            Ok(permit) => permit,
-            Err(TryAcquireError::NoPermits) => {
-                // computation already ongoing or finished with success
-                return;
+        if let CurrentLogicalSize::Approximate(_) = &current_size {
+            if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
+                let first = self
+                    .current_logical_size
+                    .did_return_approximate_to_walreceiver
+                    .compare_exchange(
+                        false,
+                        true,
+                        AtomicOrdering::Relaxed,
+                        AtomicOrdering::Relaxed,
+                    )
+                    .is_ok();
+                if first {
+                    crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
+                }
             }
-            Err(TryAcquireError::Closed) => unreachable!("we never call close"),
-        };
-        debug_assert!(self
-            .current_logical_size
-            .initial_logical_size
-            .get()
-            .is_none());
+        }
+
+        current_size
+    }
+
+    fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
+        let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
+            // nothing to do for freshly created timelines;
+            assert_eq!(
+                self.current_logical_size.current_size().accuracy(),
+                logical_size::Accuracy::Exact,
+            );
+            return;
+        };
+
+        let cancel_wait_for_background_loop_concurrency_limit_semaphore = CancellationToken::new();
+        let token = cancel_wait_for_background_loop_concurrency_limit_semaphore.clone();
+        self.current_logical_size
+            .cancel_wait_for_background_loop_concurrency_limit_semaphore.set(token)
+            .expect("initial logical size calculation task must be spawned exactly once per Timeline object");
 
-        info!(
-            "spawning logical size computation from context of task kind {:?}",
-            ctx.task_kind()
-        );
-        let causing_task_kind = ctx.task_kind();
-        // We need to start the computation task.
-        // It gets a separate context since it will outlive the request that called this function.
         let self_clone = Arc::clone(self);
         let background_ctx = ctx.detached_child(
             TaskKind::InitialLogicalSizeCalculation,
@@ -1844,96 +1847,152 @@ impl Timeline {
             false,
             // NB: don't log errors here, task_mgr will do that.
             async move {
-
                 let cancel = task_mgr::shutdown_token();
+                self_clone
+                    .initial_logical_size_calculation_task(
+                        initial_part_end,
+                        cancel_wait_for_background_loop_concurrency_limit_semaphore,
+                        cancel,
+                        background_ctx,
+                    )
+                    .await;
+                Ok(())
+            }
+            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
+        );
+    }
 
-                // in case we were created during pageserver initialization, wait for
-                // initialization to complete before proceeding. startup time init runs on the same
-                // runtime.
-                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); },
-                    _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
+    async fn initial_logical_size_calculation_task(
+        self: Arc<Self>,
+        initial_part_end: Lsn,
+        skip_concurrency_limiter: CancellationToken,
+        cancel: CancellationToken,
+        background_ctx: RequestContext,
+    ) {
+        enum BackgroundCalculationError {
+            Cancelled,
+            Other(anyhow::Error),
+        }
+
+        let try_once = |attempt: usize| {
+            let background_ctx = &background_ctx;
+            let self_ref = &self;
+            let skip_concurrency_limiter = &skip_concurrency_limiter;
+            async move {
+                let cancel = task_mgr::shutdown_token();
+                let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
+                    BackgroundLoopKind::InitialLogicalSizeCalculation,
+                    background_ctx,
+                    &cancel,
+                );
+
+                use crate::metrics::initial_logical_size::StartCircumstances;
+                let (_maybe_permit, circumstances) = tokio::select! {
+                    res = wait_for_permit => {
+                        match res {
+                            Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
+                            Err(RateLimitError::Cancelled) => {
+                                return Err(BackgroundCalculationError::Cancelled);
+                            }
+                        }
+                    }
+                    () = skip_concurrency_limiter.cancelled() => {
+                        // Some action that is part of a end user interaction requested logical size
+                        // => break out of the rate limit
+                        // TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
+                        // but then again what happens if they cancel; also, we should just be using
+                        // one runtime across the entire process, so, let's leave this for now.
+                        (None, StartCircumstances::SkippedConcurrencyLimiter)
+                    }
                 };
 
-
-
-                // hold off background tasks from starting until all timelines get to try at least
-                // once initial logical size calculation; though retry will rarely be useful.
-                // holding off is done because heavier tasks execute blockingly on the same
-                // runtime.
-                //
-                // dropping this at every outcome is probably better than trying to cling on to it,
-                // delay will be terminated by a timeout regardless.
-                let completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
-
-                let metrics_guard = match &completion {
-                    Some(_) => crate::metrics::initial_logical_size::START_CALCULATION.first(Some(causing_task_kind)),
-                    None => crate::metrics::initial_logical_size::START_CALCULATION.retry(Some(causing_task_kind)),
+                let metrics_guard = if attempt == 1 {
+                    crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
+                } else {
+                    crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
                 };
 
-                let calculated_size = match self_clone
-                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
+                match self_ref
+                    .logical_size_calculation_task(
+                        initial_part_end,
+                        LogicalSizeCalculationCause::Initial,
+                        background_ctx,
+                    )
                     .await
                 {
-                    Ok(s) => s,
+                    Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
                     Err(CalculateLogicalSizeError::Cancelled) => {
-                        // Don't make noise, this is a common task.
-                        // In the unlikely case that there is another call to this function, we'll retry
-                        // because initial_logical_size is still None.
-                        info!("initial size calculation cancelled, likely timeline delete / tenant detach");
-                        return Ok(());
+                        Err(BackgroundCalculationError::Cancelled)
                     }
                     Err(CalculateLogicalSizeError::Other(err)) => {
-                        if let Some(e @ PageReconstructError::AncestorStopping(_)) =
+                        if let Some(PageReconstructError::AncestorStopping(_)) =
                             err.root_cause().downcast_ref()
                         {
-                            // This can happen if the timeline parent timeline switches to
-                            // Stopping state while we're still calculating the initial
-                            // timeline size for the child, for example if the tenant is
-                            // being detached or the pageserver is shut down. Like with
-                            // CalculateLogicalSizeError::Cancelled, don't make noise.
-                            info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
-                            return Ok(());
+                            Err(BackgroundCalculationError::Cancelled)
+                        } else {
+                            Err(BackgroundCalculationError::Other(err))
                         }
-                        return Err(err.context("Failed to calculate logical size"));
-                    }
-                };
-
-                // we cannot query current_logical_size.current_size() to know the current
-                // *negative* value, only truncated to u64.
-                let added = self_clone
-                    .current_logical_size
-                    .size_added_after_initial
-                    .load(AtomicOrdering::Relaxed);
-
-                let sum = calculated_size.saturating_add_signed(added);
-
-                // set the gauge value before it can be set in `update_current_logical_size`.
-                self_clone.metrics.current_logical_size_gauge.set(sum);
-
-                match self_clone
-                    .current_logical_size
-                    .initial_logical_size
-                    .set((calculated_size, metrics_guard.calculation_result_saved()))
-                {
-                    Ok(()) => (),
-                    Err(_what_we_just_attempted_to_set) => {
-                        let (existing_size, _) = self_clone
-                            .current_logical_size
-                            .initial_logical_size
-                            .get()
-                            .expect("once_cell set was lost, then get failed, impossible.");
-                        // This shouldn't happen because the semaphore is initialized with 1.
-                        // But if it happens, just complain & report success so there are no further retries.
-                        error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
                     }
                 }
-                // now that `initial_logical_size.is_some()`, reduce permit count to 0
-                // so that we prevent future callers from spawning this task
-                permit.forget();
-                Ok(())
-            }.in_current_span(),
-        );
+            }
+        };
+
+        let retrying = async {
+            let mut attempt = 0;
+            loop {
+                attempt += 1;
+
+                match try_once(attempt).await {
+                    Ok(res) => return ControlFlow::Continue(res),
+                    Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
+                    Err(BackgroundCalculationError::Other(e)) => {
+                        warn!(attempt, "initial size calculation failed: {e:?}");
+                        // exponential back-off doesn't make sense at these long intervals;
+                        // use fixed retry interval with generous jitter instead
+                        let sleep_duration = Duration::from_secs(
+                            u64::try_from(
+                                // 1hour base
+                                (60_i64 * 60_i64)
+                                    // 10min jitter
+                                    + rand::thread_rng().gen_range(-10 * 60..10 * 60),
+                            )
+                            .expect("10min < 1hour"),
+                        );
+                        tokio::time::sleep(sleep_duration).await;
+                    }
+                }
+            }
+        };
+
+        let (calculated_size, metrics_guard) = tokio::select! {
+            res = retrying  => {
+                match res {
+                    ControlFlow::Continue(calculated_size) => calculated_size,
+                    ControlFlow::Break(()) => return,
+                }
+            }
+            _ = cancel.cancelled() => {
+                return;
+            }
+        };
+
+        // we cannot query current_logical_size.current_size() to know the current
+        // *negative* value, only truncated to u64.
+        let added = self
+            .current_logical_size
+            .size_added_after_initial
+            .load(AtomicOrdering::Relaxed);
+
+        let sum = calculated_size.saturating_add_signed(added);
+
+        // set the gauge value before it can be set in `update_current_logical_size`.
+        self.metrics.current_logical_size_gauge.set(sum);
+
+        self.current_logical_size
+            .initial_logical_size
+            .set((calculated_size, metrics_guard.calculation_result_saved()))
+            .ok()
+            .expect("only this task sets it");
     }
 
     pub fn spawn_ondemand_logical_size_calculation(
@@ -1971,6 +2030,9 @@ impl Timeline {
         receiver
     }
 
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     #[instrument(skip_all)]
     async fn logical_size_calculation_task(
         self: &Arc<Self>,
@@ -2008,6 +2070,10 @@ impl Timeline {
     ///
     /// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
     /// especially if we need to download remote layers.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn calculate_logical_size(
         &self,
         up_to_lsn: Lsn,
@@ -2123,6 +2189,10 @@ impl Timeline {
     ///
     /// This function takes the current timeline's locked LayerMap as an argument,
     /// so callers can avoid potential race conditions.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     async fn get_reconstruct_data(
         &self,
         key: Key,
@@ -2371,6 +2441,9 @@ impl Timeline {
         }
     }
 
+    /// # Cancel-safety
+    ///
+    /// This method is cancellation-safe.
     async fn lookup_cached_page(
         &self,
         key: &Key,
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 497796c80a..2a103a7ff4 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -21,7 +21,6 @@ use crate::{
         },
         CreateTimelineCause, DeleteTimelineError, Tenant,
     },
-    InitializationOrder,
 };
 
 use super::{Timeline, TimelineResources};
@@ -407,7 +406,6 @@ impl DeleteTimelineFlow {
         local_metadata: &TimelineMetadata,
         remote_client: Option<RemoteTimelineClient>,
         deletion_queue_client: DeletionQueueClient,
-        init_order: Option<&InitializationOrder>,
     ) -> anyhow::Result<()> {
         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
         // RemoteTimelineClient is the only functioning part.
@@ -420,7 +418,6 @@ impl DeleteTimelineFlow {
                     remote_client,
                     deletion_queue_client,
                 },
-                init_order,
                 // Important. We dont pass ancestor above because it can be missing.
                 // Thus we need to skip the validation here.
                 CreateTimelineCause::Delete,
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index a33fb28ebd..f2db8c91fc 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -1,11 +1,10 @@
 use anyhow::Context;
-use once_cell::sync::OnceCell;
 
-use tokio::sync::Semaphore;
+use once_cell::sync::OnceCell;
+use tokio_util::sync::CancellationToken;
 use utils::lsn::Lsn;
 
 use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
-use std::sync::Arc;
 
 /// Internal structure to hold all data needed for logical size calculation.
 ///
@@ -28,8 +27,12 @@ pub(super) struct LogicalSize {
         crate::metrics::initial_logical_size::FinishedCalculationGuard,
     )>,
 
-    /// Semaphore to track ongoing calculation of `initial_logical_size`.
-    pub initial_size_computation: Arc<tokio::sync::Semaphore>,
+    /// Cancellation for the best-effort logical size calculation.
+    ///
+    /// The token is kept in a once-cell so that we can error out if a higher priority
+    /// request comes in *before* we have started the normal logical size calculation.
+    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
+        OnceCell<CancellationToken>,
 
     /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
     pub initial_part_end: Option<Lsn>,
@@ -72,7 +75,7 @@ pub(crate) enum CurrentLogicalSize {
     Exact(Exact),
 }
 
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub(crate) enum Accuracy {
     Approximate,
     Exact,
@@ -115,11 +118,10 @@ impl LogicalSize {
         Self {
             initial_logical_size: OnceCell::with_value((0, {
                 crate::metrics::initial_logical_size::START_CALCULATION
-                    .first(None)
+                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
                     .calculation_result_saved()
             })),
-            //  initial_logical_size already computed, so, don't admit any calculations
-            initial_size_computation: Arc::new(Semaphore::new(0)),
+            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
             initial_part_end: None,
             size_added_after_initial: AtomicI64::new(0),
             did_return_approximate_to_walreceiver: AtomicBool::new(false),
@@ -129,7 +131,7 @@ impl LogicalSize {
     pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
         Self {
             initial_logical_size: OnceCell::new(),
-            initial_size_computation: Arc::new(Semaphore::new(1)),
+            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
             initial_part_end: Some(compute_to),
             size_added_after_initial: AtomicI64::new(0),
             did_return_approximate_to_walreceiver: AtomicBool::new(false),
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 7045658f24..3bcb7ff891 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -397,7 +397,10 @@ pub(super) async fn handle_walreceiver_connection(
             // Send the replication feedback message.
             // Regular standby_status_update fields are put into this message.
             let current_timeline_size = timeline
-                .get_current_logical_size(&ctx)
+                .get_current_logical_size(
+                    crate::tenant::timeline::GetLogicalSizePriority::User,
+                    &ctx,
+                )
                 // FIXME: https://github.com/neondatabase/neon/issues/5963
                 .size_dont_care_about_accuracy();
             let status_update = PageserverFeedback {
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index a4cd42b6c3..86a749eaf3 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -384,7 +384,7 @@ def test_download_remote_layers_api(
     env.pageserver.allowed_errors.extend(
         [
             ".*download failed: downloading evicted layer file failed.*",
-            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
+            f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed",
         ]
     )
 
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index 443b0812fd..3cac32b790 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -106,7 +106,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
         # Initial tenant load should reflect the delay we injected
         ("initial_tenant_load", lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p),
         # Subsequent steps should occur in expected order
-        ("initial_logical_sizes", lambda t, p: t > 0 and t >= p),
         ("background_jobs_can_start", lambda t, p: t > 0 and t >= p),
         ("complete", lambda t, p: t > 0 and t >= p),
     ]

From 85d08581edfa11de59db531c70b4650d5ec65361 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Mon, 4 Dec 2023 19:54:18 +0100
Subject: [PATCH 026/202] [compute_ctl] Introduce feature flags in the compute
 spec (#6016)

## Problem

In the past we've rolled out all new `compute_ctl` functionality straight
to all users, which could be risky. I want to have more fine-grained
control over what we enable, in which env and for which users.

## Summary of changes

Add an option to pass a list of feature flags to `compute_ctl`. If not
passed, it defaults to an empty list. Any unknown flags are ignored.

This allows us to release new experimental features more safely, as we
can then flip the flag for one specific user, only Neon employees, free
/ pro / etc. users and so on. Or control it per environment.

In the current implementation feature flags are passed via compute spec,
so they do not allow controlling behavior of `empty` computes. For them,
we can either stick with the previous approach, i.e. add separate cli
args or introduce a more generic `--features` cli argument.
---
 compute_tools/src/compute.rs  | 13 ++++++++++-
 control_plane/src/endpoint.rs |  1 +
 libs/compute_api/src/spec.rs  | 43 ++++++++++++++++++++++++++++++++++-
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 28770acdcd..9969b2166c 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -22,7 +22,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 
 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;
 
 use remote_storage::{DownloadError, RemotePath};
@@ -277,6 +277,17 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }
 
 impl ComputeNode {
+    /// Check that compute node has corresponding feature enabled.
+    pub fn has_feature(&self, feature: ComputeFeature) -> bool {
+        let state = self.state.lock().unwrap();
+
+        if let Some(s) = state.pspec.as_ref() {
+            s.spec.features.contains(&feature)
+        } else {
+            false
+        }
+    }
+
     pub fn set_status(&self, status: ComputeStatus) {
         let mut state = self.state.lock().unwrap();
         state.status = status;
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 12b1250764..a566f03db9 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -519,6 +519,7 @@ impl Endpoint {
             skip_pg_catalog_updates: self.skip_pg_catalog_updates,
             format_version: 1.0,
             operation_uuid: None,
+            features: vec![],
             cluster: Cluster {
                 cluster_id: None, // project ID: not used
                 name: None,       // project name: not used
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 175b4461ac..d9c384a5d3 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -26,6 +26,13 @@ pub struct ComputeSpec {
     // but we don't use it for anything. Serde will ignore missing fields when
     // deserializing it.
     pub operation_uuid: Option<String>,
+
+    /// Compute features to enable. These feature flags are provided, when we
+    /// know all the details about client's compute, so they cannot be used
+    /// to change `Empty` compute behavior.
+    #[serde(default)]
+    pub features: Vec<ComputeFeature>,
+
     /// Expected cluster state at the end of transition process.
     pub cluster: Cluster,
     pub delta_operations: Option<Vec<DeltaOp>>,
@@ -68,6 +75,19 @@ pub struct ComputeSpec {
     pub remote_extensions: Option<RemoteExtSpec>,
 }
 
+/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeFeature {
+    // XXX: Add more feature flags here.
+
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically all unknown to enum flags are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
+    #[serde(other)]
+    UnknownFeature,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
     pub public_extensions: Option<Vec<String>>,
@@ -229,7 +249,10 @@ mod tests {
     #[test]
     fn parse_spec_file() {
         let file = File::open("tests/cluster_spec.json").unwrap();
-        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        // Features list defaults to empty vector.
+        assert!(spec.features.is_empty());
     }
 
     #[test]
@@ -241,4 +264,22 @@ mod tests {
         ob.insert("unknown_field_123123123".into(), "hello".into());
         let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
     }
+
+    #[test]
+    fn parse_unknown_features() {
+        // Test that unknown feature flags do not cause any errors.
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+
+        // Add unknown feature flags.
+        let features = vec!["foo_bar_feature", "baz_feature"];
+        ob.insert("features".into(), features.into());
+
+        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
+
+        assert!(spec.features.len() == 2);
+        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
+        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
+    }
 }

From bad686bb71083ae21bbb4bd1709715123b086440 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Mon, 4 Dec 2023 13:10:23 -0800
Subject: [PATCH 027/202] Remove trusted from wal2json (#6035)

## Problem

## Summary of changes
---
 Dockerfile.compute-node | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 425f163e8b..a3772265c0 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -721,8 +721,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
     echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
     mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/wal2json.control
+    make -j $(getconf _NPROCESSORS_ONLN) install
 
 #########################################################################################
 #

From c48918d329f5f03caa70af65bb615846d12813ec Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Tue, 5 Dec 2023 11:03:07 +0100
Subject: [PATCH 028/202] Rename metric (#6030)

## Problem

It looks like the reallocation of the buckets in the previous PR broke
the metric in Grafana.

## Summary of changes

Renamed the metric.
---
 proxy/src/proxy.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 36d01f9acc..3b6d9cb61d 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -134,9 +134,9 @@ pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
 
 pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
-        "semaphore_control_plane_token_acquire_seconds",
+        "proxy_control_plane_token_acquire_seconds",
         "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.00005ms = 2.15s
+        // largest bucket = 3^16 * 0.05ms = 2.15s
         exponential_buckets(0.00005, 3.0, 16).unwrap(),
     )
     .unwrap()

From f60e49fe8ec38a0a461558c37d4c308c4bd2cdfc Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 5 Dec 2023 10:24:16 +0000
Subject: [PATCH 029/202] proxy: fix panic in startup packet (#6032)

## Problem

Panic when fewer than 8 bytes are presented in a startup packet.

## Summary of changes

The packet must contain a 4-byte message code after the 4-byte length
field, so the expected minimum length is 8.
---
 libs/pq_proto/src/lib.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 41fc206cd7..c52a21bcd3 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -289,10 +289,10 @@ impl FeStartupPacket {
         // We shouldn't advance `buf` as probably full message is not there yet,
         // so can't directly use Bytes::get_u32 etc.
         let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
         // which is less readable
         #[allow(clippy::manual_range_contains)]
-        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+        if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
             return Err(ProtocolError::Protocol(format!(
                 "invalid startup packet message length {}",
                 len
@@ -975,4 +975,10 @@ mod tests {
         let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
         assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
     }
+
+    #[test]
+    fn parse_fe_startup_packet_regression() {
+        let data = [0, 0, 0, 7, 0, 0, 0, 0];
+        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
+    }
 }

From 61fe9d360d3280bb04558152232986da0e3f9667 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 5 Dec 2023 12:01:55 +0000
Subject: [PATCH 030/202] pageserver: add Key->Shard mapping logic & use it in
 page service (#5980)

## Problem

When a pageserver receives a page service request identified by
TenantId, it must decide which `Tenant` object to route it to.

As in earlier PRs, this stuff is all a no-op for tenants with a single
shard: calls to `is_key_local` always return true without doing any
hashing on a single-shard ShardIdentity.

Closes: https://github.com/neondatabase/neon/issues/6026

## Summary of changes

- Carry immutable `ShardIdentity` objects in Tenant and Timeline. These
provide the information that Tenants/Timelines need to figure out which
shard is responsible for which Key.
- Augment `get_active_tenant_with_timeout` to take a `ShardSelector`
specifying how the shard should be resolved for this tenant. This mode
depends on the kind of request (e.g. basebackups always go to shard
zero).
- In `handle_get_page_at_lsn_request`, handle the case where the
Timeline we looked up at connection time is not the correct shard for
the page being requested. This can happen whenever one node holds
multiple shards for the same tenant. This is currently written as a
"slow path" with the optimistic expectation that usually we'll run with
one shard per pageserver, and the Timeline resolved at connection time
will be the one serving page requests. There is scope for optimization
here later, to avoid doing the full shard lookup for each page.
- Omit consumption metrics from nonzero shards: only the 0th shard is
responsible for tracking accurate relation sizes.

Note to reviewers:
- Testing of these changes is happening separately on the
`jcsp/sharding-pt1` branch, where we have hacked neon_local etc. as
needed to run test_pg_regress.
- The main caveat to this implementation is that page service
connections still look up one Timeline when the connection is opened,
before they know which pages are going to be read. If there is one shard
per pageserver then this will always also be the Timeline that serves
page requests. However, if multiple shards are on one pageserver then
get page requests will incur the cost of looking up the correct Timeline
on each getpage request. We may look to improve this in future with a
"sticky" timeline per connection handler so that subsequent requests for
the same Timeline don't have to look up again, and/or by having postgres
pass a shard hint when connecting. This is tracked in the "Loose ends"
section of https://github.com/neondatabase/neon/issues/5507
---
 libs/pageserver_api/src/key.rs                |   4 +
 libs/pageserver_api/src/shard.rs              | 134 +++++++++++++++++-
 pageserver/src/consumption_metrics/metrics.rs |   6 +
 pageserver/src/page_service.rs                |  69 +++++++--
 pageserver/src/pgdatadir_mapping.rs           |   9 +-
 pageserver/src/tenant.rs                      |  14 ++
 pageserver/src/tenant/mgr.rs                  |  90 ++++++++++--
 pageserver/src/tenant/timeline.rs             |  12 +-
 8 files changed, 311 insertions(+), 27 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index b5350d6384..16715bc667 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -140,3 +140,7 @@ impl Key {
         })
     }
 }
+
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 3510b4dbca..7bcc0ee4c6 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,5 +1,6 @@
 use std::{ops::RangeInclusive, str::FromStr};
 
+use crate::key::{is_rel_block_key, Key};
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
 use thiserror;
@@ -302,6 +303,8 @@ pub struct ShardStripeSize(pub u32);
 pub struct ShardLayout(u8);
 
 const LAYOUT_V1: ShardLayout = ShardLayout(1);
+/// ShardIdentity uses a magic layout value to indicate if it is unusable
+const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
 
 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
@@ -310,10 +313,10 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 /// to resolve a key to a shard, and then check whether that shard is ==self.
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardIdentity {
-    pub layout: ShardLayout,
     pub number: ShardNumber,
     pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
+    stripe_size: ShardStripeSize,
+    layout: ShardLayout,
 }
 
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
@@ -339,6 +342,22 @@ impl ShardIdentity {
         }
     }
 
+    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
+    /// which are constructed in code paths that don't have access to proper configuration.
+    ///
+    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
+    /// Enforcement is via assertions, to avoid making our interface fallible for this
+    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
+    /// state, and by extension to avoid trying to do any page->shard resolution.
+    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            number,
+            count,
+            layout: LAYOUT_BROKEN,
+            stripe_size: DEFAULT_STRIPE_SIZE,
+        }
+    }
+
     pub fn is_unsharded(&self) -> bool {
         self.number == ShardNumber(0) && self.count == ShardCount(0)
     }
@@ -365,6 +384,33 @@ impl ShardIdentity {
             })
         }
     }
+
+    fn is_broken(&self) -> bool {
+        self.layout == LAYOUT_BROKEN
+    }
+
+    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
+        assert!(!self.is_broken());
+        key_to_shard_number(self.count, self.stripe_size, key)
+    }
+
+    /// Return true if the key should be ingested by this shard
+    pub fn is_key_local(&self, key: &Key) -> bool {
+        assert!(!self.is_broken());
+        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
+            true
+        } else {
+            key_to_shard_number(self.count, self.stripe_size, key) == self.number
+        }
+    }
+
+    pub fn shard_slug(&self) -> String {
+        if self.count > ShardCount(0) {
+            format!("-{:02x}{:02x}", self.number.0, self.count.0)
+        } else {
+            String::new()
+        }
+    }
 }
 
 impl Serialize for ShardIndex {
@@ -438,6 +484,65 @@ impl<'de> Deserialize<'de> for ShardIndex {
     }
 }
 
+/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
+/// in order to be able to serve basebackup requests without peer communication).
+fn key_is_shard0(key: &Key) -> bool {
+    // To decide what to shard out to shards >0, we apply a simple rule that only
+    // relation pages are distributed to shards other than shard zero. Everything else gets
+    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
+    // requests, and any request other than those for particular blocks in relations.
+    //
+    // In this condition:
+    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
+    // all metadata.
+    // - field6 is set to -1 for relation size pages.
+    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+fn murmurhash32(mut h: u32) -> u32 {
+    h ^= h >> 16;
+    h = h.wrapping_mul(0x85ebca6b);
+    h ^= h >> 13;
+    h = h.wrapping_mul(0xc2b2ae35);
+    h ^= h >> 16;
+    h
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+fn hash_combine(mut a: u32, mut b: u32) -> u32 {
+    b = b.wrapping_add(0x9e3779b9);
+    b = b.wrapping_add(a << 6);
+    b = b.wrapping_add(a >> 2);
+
+    a ^= b;
+    a
+}
+
+/// Where a Key is to be distributed across shards, select the shard.  This function
+/// does not account for keys that should be broadcast across shards.
+///
+/// The hashing in this function must exactly match what we do in postgres smgr
+/// code.  The resulting distribution of pages is intended to preserve locality within
+/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
+/// distributing data pseudo-randomly.
+///
+/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
+/// and will be handled at higher levels when shards are split.
+fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
+    // Fast path for un-sharded tenants or broadcast keys
+    if count < ShardCount(2) || key_is_shard0(key) {
+        return ShardNumber(0);
+    }
+
+    // relNode
+    let mut hash = murmurhash32(key.field4);
+    // blockNum/stripe size
+    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
+
+    ShardNumber((hash % count.0 as u32) as u8)
+}
+
 #[cfg(test)]
 mod tests {
     use std::str::FromStr;
@@ -609,4 +714,29 @@ mod tests {
 
         Ok(())
     }
+
+    // These are only smoke tests to spot check that our implementation doesn't
+    // deviate from a few examples values: not aiming to validate the overall
+    // hashing algorithm.
+    #[test]
+    fn murmur_hash() {
+        assert_eq!(murmurhash32(0), 0);
+
+        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
+    }
+
+    #[test]
+    fn shard_mapping() {
+        let key = Key {
+            field1: 0x00,
+            field2: 0x67f,
+            field3: 0x5,
+            field4: 0x400c,
+            field5: 0x00,
+            field6: 0x7d06,
+        };
+
+        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
+        assert_eq!(shard, ShardNumber(8));
+    }
 }
diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs
index d70f1fec4d..918e45ea9e 100644
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -2,6 +2,7 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
+use pageserver_api::shard::ShardNumber;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
     id::{TenantId, TimelineId},
@@ -228,6 +229,11 @@ where
     while let Some((tenant_id, tenant)) = tenants.next().await {
         let mut tenant_resident_size = 0;
 
+        // Sharded tenants report all consumption metrics from shard zero
+        if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
+            continue;
+        }
+
         for timeline in tenant.list_timelines() {
             let timeline_id = timeline.timeline_id;
 
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 82c16eb9bd..9956e761e6 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -53,12 +53,14 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
+use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
+use crate::tenant::mgr::ShardSelector;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 
@@ -399,16 +401,19 @@ impl PageServerHandler {
     {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        // TODO(sharding): enumerate local tenant shards for this tenant, and select the one
-        // that should serve this request.
-
-        // Make request tracer if needed
+        // Note that since one connection may contain getpage requests that target different
+        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
+        // that we look up here may not be the one that serves all the actual requests: we will double
+        // check the mapping of key->shard later before calling into Timeline for getpage requests.
         let tenant = mgr::get_active_tenant_with_timeout(
             tenant_id,
+            ShardSelector::First,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
         .await?;
+
+        // Make request tracer if needed
         let mut tracer = if tenant.get_trace_read_requests() {
             let connection_id = ConnectionId::generate();
             let path =
@@ -566,6 +571,7 @@ impl PageServerHandler {
         info!("creating new timeline");
         let tenant = get_active_tenant_with_timeout(
             tenant_id,
+            ShardSelector::Zero,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
@@ -628,7 +634,7 @@ impl PageServerHandler {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
         let last_record_lsn = timeline.get_last_record_lsn();
         if last_record_lsn != start_lsn {
@@ -807,9 +813,49 @@ impl PageServerHandler {
         }
         */
 
-        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-            .await?;
+        let key = rel_block_to_key(req.rel, req.blkno);
+        let page = if timeline.get_shard_identity().is_key_local(&key) {
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        } else {
+            // The Tenant shard we looked up at connection start does not hold this particular
+            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
+            // has multiple shards for the same tenant.
+            //
+            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
+            let timeline = match self
+                .get_active_tenant_timeline(
+                    timeline.tenant_shard_id.tenant_id,
+                    timeline.timeline_id,
+                    ShardSelector::Page(key),
+                )
+                .await
+            {
+                Ok(t) => t,
+                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                    // We already know this tenant exists in general, because we resolved it at
+                    // start of connection.  Getting a NotFound here indicates that the shard containing
+                    // the requested page is not present on this node.
+
+                    // TODO: this should be some kind of structured error that the client will understand,
+                    // so that it can block until its config is updated: this error is expected in the case
+                    // that the Tenant's shards' placements are being updated and the client hasn't been
+                    // informed yet.
+                    //
+                    // https://github.com/neondatabase/neon/issues/6038
+                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
+                }
+                Err(e) => return Err(e.into()),
+            };
+
+            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
+            // the GateGuard was already held over the whole connection.
+            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        };
 
         Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
             page,
@@ -838,7 +884,7 @@ impl PageServerHandler {
 
         // check that the timeline exists
         let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         if let Some(lsn) = lsn {
@@ -944,9 +990,11 @@ impl PageServerHandler {
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
+        selector: ShardSelector,
     ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
         let tenant = get_active_tenant_with_timeout(
             tenant_id,
+            selector,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
@@ -1120,7 +1168,7 @@ where
 
             self.check_permission(Some(tenant_id))?;
             let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id)
+                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
                 .await?;
 
             let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1307,6 +1355,7 @@ where
 
             let tenant = get_active_tenant_with_timeout(
                 tenant_id,
+                ShardSelector::Zero,
                 ACTIVE_TENANT_TIMEOUT,
                 &task_mgr::shutdown_token(),
             )
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index a448142158..d37d953696 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,6 +13,7 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Buf, Bytes};
+use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -1322,7 +1323,7 @@ impl<'a> DatadirModification<'a> {
         // Flush relation and  SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::new();
         for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(key) || is_slru_block_key(key) {
+            if is_rel_block_key(&key) || is_slru_block_key(key) {
                 // This bails out on first error without modifying pending_updates.
                 // That's Ok, cf this function's doc comment.
                 writer.put(key, self.lsn, &value, ctx).await?;
@@ -1578,7 +1579,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
     }
 }
 
-fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
     Key {
         field1: 0x00,
         field2: rel.spcnode,
@@ -1777,10 +1778,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     })
 }
 
-fn is_rel_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
-}
-
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
     key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0b2e48e1ff..65cfef1097 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -19,6 +19,7 @@ use futures::stream::FuturesUnordered;
 use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models::TimelineState;
+use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
@@ -236,6 +237,9 @@ pub struct Tenant {
 
     tenant_shard_id: TenantShardId,
 
+    // The detailed sharding information, beyond the number/count in tenant_shard_id
+    shard_identity: ShardIdentity,
+
     /// The remote storage generation, used to protect S3 objects from split-brain.
     /// Does not change over the lifetime of the [`Tenant`] object.
     ///
@@ -568,6 +572,7 @@ impl Tenant {
         tenant_shard_id: TenantShardId,
         resources: TenantSharedResources,
         attached_conf: AttachedTenantConf,
+        shard_identity: ShardIdentity,
         init_order: Option<InitializationOrder>,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         mode: SpawnMode,
@@ -589,6 +594,7 @@ impl Tenant {
             TenantState::Attaching,
             conf,
             attached_conf,
+            shard_identity,
             wal_redo_manager,
             tenant_shard_id,
             remote_storage.clone(),
@@ -1040,6 +1046,9 @@ impl Tenant {
             },
             conf,
             AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
+            // Shard identity isn't meaningful for a broken tenant: it's just a placeholder
+            // to occupy the slot for this TenantShardId.
+            ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
             wal_redo_manager,
             tenant_shard_id,
             None,
@@ -2304,6 +2313,7 @@ impl Tenant {
             new_timeline_id,
             self.tenant_shard_id,
             self.generation,
+            self.shard_identity,
             Arc::clone(&self.walredo_mgr),
             resources,
             pg_version,
@@ -2321,6 +2331,7 @@ impl Tenant {
         state: TenantState,
         conf: &'static PageServerConf,
         attached_conf: AttachedTenantConf,
+        shard_identity: ShardIdentity,
         walredo_mgr: Arc<WalRedoManager>,
         tenant_shard_id: TenantShardId,
         remote_storage: Option<GenericRemoteStorage>,
@@ -2382,6 +2393,7 @@ impl Tenant {
 
         Tenant {
             tenant_shard_id,
+            shard_identity,
             generation: attached_conf.location.generation,
             conf,
             // using now here is good enough approximation to catch tenants with really long
@@ -3793,6 +3805,8 @@ pub(crate) mod harness {
                     self.generation,
                 ))
                 .unwrap(),
+                // This is a legacy/test code path: sharding isn't supported here.
+                ShardIdentity::unsharded(),
                 walredo_mgr,
                 self.tenant_shard_id,
                 Some(self.remote_storage.clone()),
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index f34d62ba53..8339e7b583 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,7 +2,8 @@
 //! page server.
 
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::key::Key;
+use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
@@ -130,6 +131,18 @@ pub(crate) enum TenantsMapRemoveResult {
     InProgress(utils::completion::Barrier),
 }
 
+/// When resolving a TenantId to a shard, we may be looking for the 0th
+/// shard, or we might be looking for whichever shard holds a particular page.
+pub(crate) enum ShardSelector {
+    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
+    /// ignore it.
+    Zero,
+    /// Pick the first shard we find for the TenantId
+    First,
+    /// Pick the shard that holds this key
+    Page(Key),
+}
+
 impl TenantsMap {
     /// Convenience function for typical usage, where we want to get a `Tenant` object, for
     /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -144,6 +157,49 @@ impl TenantsMap {
         }
     }
 
+    /// A page service client sends a TenantId, and to look up the correct Tenant we must
+    /// resolve this to a fully qualified TenantShardId.
+    fn resolve_shard(
+        &self,
+        tenant_id: &TenantId,
+        selector: ShardSelector,
+    ) -> Option<TenantShardId> {
+        let mut want_shard = None;
+        match self {
+            TenantsMap::Initializing => None,
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
+                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    match selector {
+                        ShardSelector::First => return Some(*slot.0),
+                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
+                            return Some(*slot.0)
+                        }
+                        ShardSelector::Page(key) => {
+                            if let Some(tenant) = slot.1.get_attached() {
+                                // First slot we see for this tenant, calculate the expected shard number
+                                // for the key: we will use this for checking if this and subsequent
+                                // slots contain the key, rather than recalculating the hash each time.
+                                if want_shard.is_none() {
+                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                                }
+
+                                if Some(tenant.shard_identity.number) == want_shard {
+                                    return Some(*slot.0);
+                                }
+                            } else {
+                                continue;
+                            }
+                        }
+                        _ => continue,
+                    }
+                }
+
+                // Fall through: we didn't find an acceptable shard
+                None
+            }
+        }
+    }
+
     /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
     ///
     /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -515,12 +571,14 @@ pub async fn init_tenant_mgr(
         location_conf.attach_in_generation(generation);
         Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
 
+        let shard_identity = location_conf.shard;
         match tenant_spawn(
             conf,
             tenant_shard_id,
             &tenant_dir_path,
             resources.clone(),
             AttachedTenantConf::try_from(location_conf)?,
+            shard_identity,
             Some(init_order.clone()),
             &TENANTS,
             SpawnMode::Normal,
@@ -561,6 +619,7 @@ pub(crate) fn tenant_spawn(
     tenant_path: &Utf8Path,
     resources: TenantSharedResources,
     location_conf: AttachedTenantConf,
+    shard_identity: ShardIdentity,
     init_order: Option<InitializationOrder>,
     tenants: &'static std::sync::RwLock<TenantsMap>,
     mode: SpawnMode,
@@ -593,6 +652,7 @@ pub(crate) fn tenant_spawn(
         tenant_shard_id,
         resources,
         location_conf,
+        shard_identity,
         init_order,
         tenants,
         mode,
@@ -762,12 +822,14 @@ pub(crate) async fn create_tenant(
         tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
     let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
 
+    let shard_identity = location_conf.shard;
     let created_tenant = tenant_spawn(
         conf,
         tenant_shard_id,
         &tenant_path,
         resources,
         AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
         None,
         &TENANTS,
         SpawnMode::Create,
@@ -860,6 +922,7 @@ impl TenantManager {
         Ok(())
     }
 
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
     pub(crate) async fn upsert_location(
         &self,
         tenant_shard_id: TenantShardId,
@@ -996,12 +1059,14 @@ impl TenantManager {
                     .await
                     .map_err(SetNewTenantConfigError::Persist)?;
 
+                let shard_identity = new_location_config.shard;
                 let tenant = tenant_spawn(
                     self.conf,
                     tenant_shard_id,
                     &tenant_path,
                     self.resources.clone(),
                     AttachedTenantConf::try_from(new_location_config)?,
+                    shard_identity,
                     None,
                     self.tenants,
                     SpawnMode::Normal,
@@ -1100,6 +1165,7 @@ pub(crate) enum GetActiveTenantError {
 /// then wait for up to `timeout` (minus however long we waited for the slot).
 pub(crate) async fn get_active_tenant_with_timeout(
     tenant_id: TenantId,
+    shard_selector: ShardSelector,
     timeout: Duration,
     cancel: &CancellationToken,
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1108,15 +1174,17 @@ pub(crate) async fn get_active_tenant_with_timeout(
         Tenant(Arc<Tenant>),
     }
 
-    // TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
-    // to decide which shard services the request)
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
     let wait_start = Instant::now();
     let deadline = wait_start + timeout;
 
-    let wait_for = {
+    let (wait_for, tenant_shard_id) = {
         let locked = TENANTS.read().unwrap();
+
+        // Resolve TenantId to TenantShardId
+        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
+            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
+        )?;
+
         let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
             .map_err(GetTenantError::MapState)?;
         match peek_slot {
@@ -1126,7 +1194,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                         // Fast path: we don't need to do any async waiting.
                         return Ok(tenant.clone());
                     }
-                    _ => WaitFor::Tenant(tenant.clone()),
+                    _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
                 }
             }
             Some(TenantSlot::Secondary) => {
@@ -1134,7 +1202,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
                     tenant_id,
                 )))
             }
-            Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
+            Some(TenantSlot::InProgress(barrier)) => {
+                (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
+            }
             None => {
                 return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
                     tenant_id,
@@ -1377,12 +1447,14 @@ pub(crate) async fn load_tenant(
 
     Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
 
+    let shard_identity = location_conf.shard;
     let new_tenant = tenant_spawn(
         conf,
         tenant_shard_id,
         &tenant_path,
         resources,
         AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
         None,
         &TENANTS,
         SpawnMode::Normal,
@@ -1472,12 +1544,14 @@ pub(crate) async fn attach_tenant(
     // TODO: tenant directory remains on disk if we bail out from here on.
     //       See https://github.com/neondatabase/neon/issues/4233
 
+    let shard_identity = location_conf.shard;
     let attached_tenant = tenant_spawn(
         conf,
         tenant_shard_id,
         &tenant_dir,
         resources,
         AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
         None,
         &TENANTS,
         SpawnMode::Normal,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f02fd733b4..f666f1049f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,7 +18,7 @@ use pageserver_api::{
         DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
         TimelineState,
     },
-    shard::TenantShardId,
+    shard::{ShardIdentity, TenantShardId},
 };
 use rand::Rng;
 use serde_with::serde_as;
@@ -167,6 +167,10 @@ pub struct Timeline {
     /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
     pub(crate) generation: Generation,
 
+    /// The detailed sharding information from our parent Tenant.  This enables us to map keys
+    /// to shards, and is constant through the lifetime of this Timeline.
+    shard_identity: ShardIdentity,
+
     pub pg_version: u32,
 
     /// The tuple has two elements.
@@ -1335,6 +1339,7 @@ impl Timeline {
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         generation: Generation,
+        shard_identity: ShardIdentity,
         walredo_mgr: Arc<super::WalRedoManager>,
         resources: TimelineResources,
         pg_version: u32,
@@ -1364,6 +1369,7 @@ impl Timeline {
                 timeline_id,
                 tenant_shard_id,
                 generation,
+                shard_identity,
                 pg_version,
                 layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
                 wanted_image_layers: Mutex::new(None),
@@ -2478,6 +2484,10 @@ impl Timeline {
         Ok(Arc::clone(ancestor))
     }
 
+    pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
+        &self.shard_identity
+    }
+
     ///
     /// Get a handle to the latest layer for appending.
     ///

From bc1020f965c1b0f6381acfecf1ac0105c7d31113 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Tue, 5 Dec 2023 13:38:45 +0100
Subject: [PATCH 031/202] compute_ctl: Notify waiters when Postgres failed to
 start (#6034)

When configuring an empty compute, the API handler waits on a condvar
for a compute state change. Previously, if Postgres failed to start, we
just set the compute status to `Failed` without notifying any waiters.
This caused a timeout on the control plane side, even though compute
could have returned a proper error earlier.

With this commit, the API handler is properly notified.
---
 compute_tools/src/bin/compute_ctl.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 36e9ca0731..ce7345d5be 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -274,7 +274,13 @@ fn main() -> Result<()> {
             let mut state = compute.state.lock().unwrap();
             state.error = Some(format!("{:?}", err));
             state.status = ComputeStatus::Failed;
-            drop(state);
+            // Notify others that Postgres failed to start. In case of configuring the
+            // empty compute, it's likely that API handler is still waiting for compute
+            // state change. With this we will notify it that compute is in Failed state,
+            // so control plane will know about it earlier and record proper error instead
+            // of timeout.
+            compute.state_changed.notify_all();
+            drop(state); // unlock
             delay_exit = true;
             None
         }

From be885370f63d42ee281f1a7ba2e36aeea6d7f5cc Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 5 Dec 2023 15:03:07 +0000
Subject: [PATCH 032/202] pageserver: remove redundant unsafe_create_dir_all
 (#6040)

This non-fsyncing analog to our safe directory creation function was
just duplicating what tokio's fs::create_dir_all does.
---
 pageserver/src/tenant/mgr.rs | 47 ++----------------------------------
 1 file changed, 2 insertions(+), 45 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 8339e7b583..c09270112f 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -270,49 +270,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
     Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
 
-/// Create a directory, including parents.  This does no fsyncs and makes
-/// no guarantees about the persistence of the resulting metadata: for
-/// use when creating dirs for use as cache.
-async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
-    let mut dirs_to_create = Vec::new();
-    let mut path: &Utf8Path = path.as_ref();
-
-    // Figure out which directories we need to create.
-    loop {
-        let meta = tokio::fs::metadata(path).await;
-        match meta {
-            Ok(metadata) if metadata.is_dir() => break,
-            Ok(_) => {
-                return Err(std::io::Error::new(
-                    std::io::ErrorKind::AlreadyExists,
-                    format!("non-directory found in path: {path}"),
-                ));
-            }
-            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
-            Err(e) => return Err(e),
-        }
-
-        dirs_to_create.push(path);
-
-        match path.parent() {
-            Some(parent) => path = parent,
-            None => {
-                return Err(std::io::Error::new(
-                    std::io::ErrorKind::InvalidInput,
-                    format!("can't find parent of path '{path}'"),
-                ));
-            }
-        }
-    }
-
-    // Create directories from parent to child.
-    for &path in dirs_to_create.iter().rev() {
-        tokio::fs::create_dir(path).await?;
-    }
-
-    Ok(())
-}
-
 /// The TenantManager is responsible for storing and mutating the collection of all tenants
 /// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
 /// lives inside the TenantManager.
@@ -1035,7 +992,7 @@ impl TenantManager {
             LocationMode::Secondary(_) => {
                 // Directory doesn't need to be fsync'd because if we crash it can
                 // safely be recreated next time this tenant location is configured.
-                unsafe_create_dir_all(&tenant_path)
+                tokio::fs::create_dir_all(&tenant_path)
                     .await
                     .with_context(|| format!("Creating {tenant_path}"))?;
 
@@ -1051,7 +1008,7 @@ impl TenantManager {
                 // Directory doesn't need to be fsync'd because we do not depend on
                 // it to exist after crashes: it may be recreated when tenant is
                 // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                unsafe_create_dir_all(&timelines_path)
+                tokio::fs::create_dir_all(&timelines_path)
                     .await
                     .with_context(|| format!("Creating {timelines_path}"))?;
 

From da5e03b0d83fa85461a4e71fdceb4a7bb298c666 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 5 Dec 2023 15:38:27 +0000
Subject: [PATCH 033/202] pageserver: add a /reset API for tenants (#6014)

## Problem

Traditionally we would detach/attach directly with curl if we wanted to
"reboot" a single tenant. That's kind of inconvenient these days,
because one needs to know a generation number to issue an attach
request.

Closes: https://github.com/neondatabase/neon/issues/6011

## Summary of changes

- Introduce a new `/reset` API, which remembers the LocationConf from
the current attachment so that callers do not have to work out the
correct configuration/generation to use.
- As an additional support tool, allow an optional `drop_cache` query
parameter, for situations where we are concerned that some on-disk state
might be bad and want to clear that as well as the in-memory state.

One might wonder why I didn't call this "reattach" -- it's because
there's already a PS->CP API of that name and it could get confusing.
---
 pageserver/src/http/routes.rs             | 23 ++++++
 pageserver/src/tenant/mgr.rs              | 85 +++++++++++++++++++++--
 test_runner/fixtures/pageserver/http.py   |  8 +++
 test_runner/regress/test_tenant_detach.py | 25 +++++--
 4 files changed, 132 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 71b7ea05ec..14b667eeba 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -709,6 +709,26 @@ async fn tenant_detach_handler(
     json_response(StatusCode::OK, ())
 }
 
+async fn tenant_reset_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);
+    state
+        .tenant_manager
+        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn tenant_load_handler(
     mut request: Request<Body>,
     _cancel: CancellationToken,
@@ -1828,6 +1848,9 @@ pub fn make_router(
         .post("/v1/tenant/:tenant_id/detach", |r| {
             api_handler(r, tenant_detach_handler)
         })
+        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
+            api_handler(r, tenant_reset_handler)
+        })
         .post("/v1/tenant/:tenant_id/load", |r| {
             api_handler(r, tenant_load_handler)
         })
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index c09270112f..d9d44d1f8f 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1038,6 +1038,81 @@ impl TenantManager {
 
         Ok(())
     }
+
+    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
+    /// LocationConf that was last used to attach it.  Optionally, the local file cache may be
+    /// dropped before re-attaching.
+    ///
+    /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
+    /// where an issue is identified that would go away with a restart of the tenant.
+    ///
+    /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
+    /// to respect the cancellation tokens used in normal shutdown().
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
+    pub(crate) async fn reset_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+        drop_cache: bool,
+        ctx: RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        let Some(old_slot) = slot_guard.get_old_value() else {
+            anyhow::bail!("Tenant not found when trying to reset");
+        };
+
+        let Some(tenant) = old_slot.get_attached() else {
+            slot_guard.revert();
+            anyhow::bail!("Tenant is not in attached state");
+        };
+
+        let (_guard, progress) = utils::completion::channel();
+        match tenant.shutdown(progress, false).await {
+            Ok(()) => {
+                slot_guard.drop_old_value()?;
+            }
+            Err(_barrier) => {
+                slot_guard.revert();
+                anyhow::bail!("Cannot reset Tenant, already shutting down");
+            }
+        }
+
+        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
+        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+        if drop_cache {
+            tracing::info!("Dropping local file cache");
+
+            match tokio::fs::read_dir(&timelines_path).await {
+                Err(e) => {
+                    tracing::warn!("Failed to list timelines while dropping cache: {}", e);
+                }
+                Ok(mut entries) => {
+                    while let Some(entry) = entries.next_entry().await? {
+                        tokio::fs::remove_dir_all(entry.path()).await?;
+                    }
+                }
+            }
+        }
+
+        let shard_identity = config.shard;
+        let tenant = tenant_spawn(
+            self.conf,
+            tenant_shard_id,
+            &tenant_path,
+            self.resources.clone(),
+            AttachedTenantConf::try_from(config)?,
+            shard_identity,
+            None,
+            self.tenants,
+            SpawnMode::Normal,
+            &ctx,
+        )?;
+
+        slot_guard.upsert(TenantSlot::Attached(tenant))?;
+
+        Ok(())
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -1246,8 +1321,7 @@ pub(crate) async fn delete_tenant(
     // See https://github.com/neondatabase/neon/issues/5080
 
     // TODO(sharding): make delete API sharding-aware
-    let mut slot_guard =
-        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
 
     // unwrap is safe because we used MustExist mode when acquiring
     let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1574,9 +1648,10 @@ pub enum TenantSlotUpsertError {
     MapState(#[from] TenantMapError),
 }
 
-#[derive(Debug)]
+#[derive(Debug, thiserror::Error)]
 enum TenantSlotDropError {
     /// It is only legal to drop a TenantSlot if its contents are fully shut down
+    #[error("Tenant was not shut down")]
     NotShutdown,
 }
 
@@ -1636,9 +1711,9 @@ impl SlotGuard {
         }
     }
 
-    /// Take any value that was present in the slot before we acquired ownership
+    /// Get any value that was present in the slot before we acquired ownership
     /// of it: in state transitions, this will be the old state.
-    fn get_old_value(&mut self) -> &Option<TenantSlot> {
+    fn get_old_value(&self) -> &Option<TenantSlot> {
         &self.old_value
     }
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 76aa40122f..eccab5fb6a 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -260,6 +260,14 @@ class PageserverHttpClient(requests.Session):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
         self.verbose_error(res)
 
+    def tenant_reset(self, tenant_id: TenantId, drop_cache: bool):
+        params = {}
+        if drop_cache:
+            params["drop_cache"] = "true"
+
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
+        self.verbose_error(res)
+
     def tenant_delete(self, tenant_id: TenantId):
         res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
         self.verbose_error(res)
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 0bd3800480..df497c0f7b 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,4 +1,5 @@
 import asyncio
+import enum
 import random
 import time
 from threading import Thread
@@ -51,11 +52,20 @@ def do_gc_target(
         log.info("gc http thread returning")
 
 
+class ReattachMode(str, enum.Enum):
+    REATTACH_EXPLICIT = "explicit"
+    REATTACH_RESET = "reset"
+    REATTACH_RESET_DROP = "reset_drop"
+
+
 # Basic detach and re-attach test
 @pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+@pytest.mark.parametrize(
+    "mode",
+    [ReattachMode.REATTACH_EXPLICIT, ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP],
+)
 def test_tenant_reattach(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, mode: str
 ):
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
@@ -100,8 +110,15 @@ def test_tenant_reattach(
         ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
     )
 
-    pageserver_http.tenant_detach(tenant_id)
-    pageserver_http.tenant_attach(tenant_id)
+    if mode == ReattachMode.REATTACH_EXPLICIT:
+        # Explicitly detach then attach the tenant as two separate API calls
+        pageserver_http.tenant_detach(tenant_id)
+        pageserver_http.tenant_attach(tenant_id)
+    elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP):
+        # Use the reset API to detach/attach in one shot
+        pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP)
+    else:
+        raise NotImplementedError(mode)
 
     time.sleep(1)  # for metrics propagation
 

From 483caa22c6e4ba5c8a73526fe24fd5ca275b7c24 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 5 Dec 2023 16:11:15 +0000
Subject: [PATCH 034/202] pageserver: logging tweaks (#6039)

- The `Attaching tenant` log message omitted some useful information
like the generation and mode
- info-level messages about writing configuration files were
unnecessarily verbose
- During process shutdown, we don't emit logs about the various phases:
this is very cheap to log since we do it once per process lifetime, and
is helpful when figuring out where something got stuck during a hang.
---
 pageserver/src/lib.rs        | 6 +++---
 pageserver/src/tenant.rs     | 4 ++--
 pageserver/src/tenant/mgr.rs | 8 +++++++-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 0bdf096bfe..797cb6f944 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -205,7 +205,7 @@ async fn timed<Fut: std::future::Future>(
     match tokio::time::timeout(warn_at, &mut fut).await {
         Ok(ret) => {
             tracing::info!(
-                task = name,
+                stage = name,
                 elapsed_ms = started.elapsed().as_millis(),
                 "completed"
             );
@@ -213,7 +213,7 @@ async fn timed<Fut: std::future::Future>(
         }
         Err(_) => {
             tracing::info!(
-                task = name,
+                stage = name,
                 elapsed_ms = started.elapsed().as_millis(),
                 "still waiting, taking longer than expected..."
             );
@@ -222,7 +222,7 @@ async fn timed<Fut: std::future::Future>(
 
             // this has a global allowed_errors
             tracing::warn!(
-                task = name,
+                stage = name,
                 elapsed_ms = started.elapsed().as_millis(),
                 "completed, took longer than expected"
             );
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 65cfef1097..d756e13316 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2515,7 +2515,7 @@ impl Tenant {
             }
         }
 
-        info!("persisting tenantconf to {config_path}");
+        debug!("persisting tenantconf to {config_path}");
 
         let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -2550,7 +2550,7 @@ impl Tenant {
         target_config_path: &Utf8Path,
         tenant_conf: &TenantConfOpt,
     ) -> anyhow::Result<()> {
-        info!("persisting tenantconf to {target_config_path}");
+        debug!("persisting tenantconf to {target_config_path}");
 
         let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index d9d44d1f8f..12643cf61d 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -603,7 +603,13 @@ pub(crate) fn tenant_spawn(
         "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
     );
 
-    info!("Attaching tenant {tenant_shard_id}");
+    info!(
+        tenant_id = %tenant_shard_id.tenant_id,
+        shard_id = %tenant_shard_id.shard_slug(),
+        generation = ?location_conf.location.generation,
+        attach_mode = ?location_conf.location.attach_mode,
+        "Attaching tenant"
+    );
     let tenant = match Tenant::spawn(
         conf,
         tenant_shard_id,

From 7fab731f65e5760ab6b649e367270bf3f99aa227 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 5 Dec 2023 18:49:24 +0200
Subject: [PATCH 035/202] Track size of FSM fork while applying records at
 replica (#5901)

## Problem

See https://neondb.slack.com/archives/C04DGM6SMTM/p1700560921471619

## Summary of changes

Update relation size cache for FSM fork in WAL records filter

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/pagestore_smgr.c                    | 127 ++++++++++++------
 .../regress/test_physical_replication.py      |  29 ++++
 2 files changed, 113 insertions(+), 43 deletions(-)
 create mode 100644 test_runner/regress/test_physical_replication.py

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 84b26198a7..74ffbdb371 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -59,6 +59,7 @@
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
+#include "storage/fsm_internals.h"
 #include "storage/smgr.h"
 #include "storage/md.h"
 #include "pgstat.h"
@@ -2722,6 +2723,86 @@ smgr_init_neon(void)
 }
 
 
+static void
+neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
+{
+	BlockNumber relsize;
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rinfo, forknum, &relsize))
+	{
+		if (relsize < blkno + 1)
+		{
+			update_cached_relsize(rinfo, forknum, blkno + 1);
+			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+		}
+	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rinfo = rinfo,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		Assert(response->tag == T_NeonNblocksResponse);
+		nbresponse = (NeonNblocksResponse *) response;
+
+		relsize = Max(nbresponse->n_blocks, blkno+1);
+
+		set_cached_relsize(rinfo, forknum, relsize);
+		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+
+		elog(SmgrTrace, "Set length to %d", relsize);
+	}
+}
+
+#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)
+
+/*
+ * TODO: Maybe it is better to make the corresponding function from freespace.c public?
+ */
+static BlockNumber
+get_fsm_physical_block(BlockNumber heapblk)
+{
+	BlockNumber pages;
+	int			leafno;
+	int			l;
+
+	/*
+	 * Calculate the logical page number of the first leaf page below the
+	 * given page.
+	 */
+	leafno = heapblk / SlotsPerFSMPage;
+
+	/* Count upper level nodes required to address the leaf page */
+	pages = 0;
+	for (l = 0; l < FSM_TREE_DEPTH; l++)
+	{
+		pages += leafno + 1;
+		leafno /= SlotsPerFSMPage;
+	}
+
+	/* Turn the page count into 0-based block number */
+	return pages - 1;
+}
+
+
 /*
  * Return whether we can skip the redo for this block.
  * 
@@ -2769,7 +2850,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	LWLock	   *partitionLock;
 	Buffer		buffer;
 	bool		no_redo_needed;
-	BlockNumber relsize;
 
 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
 		return true;
@@ -2819,49 +2899,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 
 	LWLockRelease(partitionLock);
 
-	/* Extend the relation if we know its size */
-	if (get_cached_relsize(rinfo, forknum, &relsize))
+	neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
+	if (forknum == MAIN_FORKNUM)
 	{
-		if (relsize < blkno + 1)
-		{
-			update_cached_relsize(rinfo, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-		}
+		neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
 	}
-	else
-	{
-		/*
-		 * Size was not cached. We populate the cache now, with the size of the
-		 * relation measured after this WAL record is applied.
-		 *
-		 * This length is later reused when we open the smgr to read the block,
-		 * which is fine and expected.
-		 */
-
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.req = (NeonRequest) {
-				.lsn = end_recptr,
-				.latest = false,
-				.tag = T_NeonNblocksRequest,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
-
-		response = page_server_request(&request);
-
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
-
-		Assert(nbresponse->n_blocks > blkno);
-
-		set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
-		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-
-		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
-	}
-
 	return no_redo_needed;
 }
diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py
new file mode 100644
index 0000000000..034f2b669d
--- /dev/null
+++ b/test_runner/regress/test_physical_replication.py
@@ -0,0 +1,29 @@
+import random
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_physical_replication(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    n_records = 100000
+    with env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    ) as primary:
+        with primary.connect() as p_con:
+            with p_con.cursor() as p_cur:
+                p_cur.execute(
+                    "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))"
+                )
+        time.sleep(1)
+        with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
+            with primary.connect() as p_con:
+                with p_con.cursor() as p_cur:
+                    with secondary.connect() as s_con:
+                        with s_con.cursor() as s_cur:
+                            for pk in range(n_records):
+                                p_cur.execute("insert into t (pk) values (%s)", (pk,))
+                                s_cur.execute(
+                                    "select * from t where pk=%s", (random.randrange(1, n_records),)
+                                )

From 987c9aaea058722c02ea96100e2c7d278ee4e77c Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 6 Dec 2023 13:05:28 +0100
Subject: [PATCH 036/202] virtual_file: fix the metric for close() calls done
 by VirtualFile::drop (#6051)

Before this PR we would inc() the counter for `Close` even though the
slot's FD had already been closed.

Especially visible when subtracting `open` from `close+close-by-replace`
on a system that does a lot of attach and detach.

refs https://github.com/neondatabase/cloud/issues/8440
refs https://github.com/neondatabase/cloud/issues/8351
---
 pageserver/src/virtual_file.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index b58b883ab6..7a6443361e 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -610,9 +610,11 @@ impl Drop for VirtualFile {
             slot.recently_used.store(false, Ordering::Relaxed);
             // there is also operation "close-by-replace" for closes done on eviction for
             // comparison.
-            STORAGE_IO_TIME_METRIC
-                .get(StorageIoOperation::Close)
-                .observe_closure_duration(|| drop(slot_guard.file.take()));
+            if let Some(fd) = slot_guard.file.take() {
+                STORAGE_IO_TIME_METRIC
+                    .get(StorageIoOperation::Close)
+                    .observe_closure_duration(|| drop(fd));
+            }
         }
     }
 }

From 7501ca6efb98868831c037b9500915867ca48ab5 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:12:36 +0100
Subject: [PATCH 037/202] Revert timescaledb for pg14 and pg15 (#6056)

```
could not start the compute node: compute is in state "failed": db error: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory Caused by: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory
```
---
 Dockerfile.compute-node | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index a3772265c0..03280586f8 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -387,10 +387,20 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"
 
-RUN apt-get update && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export TIMESCALEDB_VERSION=2.10.1 \
+        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
+        ;; \
+      *) \
+        export TIMESCALEDB_VERSION=2.13.0 \
+        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
+        ;; \
+    esac && \
+    apt-get update && \
     apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \
-    echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
+    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
     mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
     ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
     cd build && \

From a3c7d400b4e3359df1e676e65078cb7198e89754 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 7 Dec 2023 09:25:22 +0200
Subject: [PATCH 038/202] fix: avoid allocations with logging a slug (#6047)

to_string forces allocating a less than pointer sized string (costing on
stack 4 usize), using a Display formattable slug saves that. the
difference seems small, but at the same time, we log these a lot.
---
 libs/pageserver_api/src/shard.rs | 23 ++++++++++++++++-------
 pageserver/src/http/routes.rs    |  4 ++--
 pageserver/src/tenant/mgr.rs     |  2 +-
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 7bcc0ee4c6..f8f9449d86 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -73,19 +73,28 @@ impl TenantShardId {
         )
     }
 
-    pub fn shard_slug(&self) -> String {
-        format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
+    }
+}
+
+/// Formatting helper
+struct ShardSlug<'a>(&'a TenantShardId);
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
     }
 }
 
 impl std::fmt::Display for TenantShardId {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         if self.shard_count != ShardCount(0) {
-            write!(
-                f,
-                "{}-{:02x}{:02x}",
-                self.tenant_id, self.shard_number.0, self.shard_count.0
-            )
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
         } else {
             // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
             // is distinct from the normal single shard case (shard count == 1).
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 14b667eeba..9e41d912c2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -844,7 +844,7 @@ async fn tenant_delete_handler(
     mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
         .instrument(info_span!("tenant_delete_handler",
             tenant_id = %tenant_shard_id.tenant_id,
-            shard = tenant_shard_id.shard_slug()
+            shard = %tenant_shard_id.shard_slug()
         ))
         .await?;
 
@@ -1193,7 +1193,7 @@ async fn put_tenant_location_config_handler(
             mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
                 .instrument(info_span!("tenant_detach",
                     tenant_id = %tenant_shard_id.tenant_id,
-                    shard = tenant_shard_id.shard_slug()
+                    shard = %tenant_shard_id.shard_slug()
                 ))
                 .await
         {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 12643cf61d..8466fe7fca 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1937,7 +1937,7 @@ fn tenant_map_acquire_slot_impl(
     METRICS.tenant_slot_writes.inc();
 
     let mut locked = tenants.write().unwrap();
-    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
     let _guard = span.enter();
 
     let m = match &mut *locked {

From 31be301ef38b89da2f7d532ba3e869ea35b78514 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 7 Dec 2023 10:20:40 +0200
Subject: [PATCH 039/202] Make simple_rcu::RcuWaitList::wait() async (#6046)

The gc_timeline() function is async, but it calls the synchronous wait()
function. In the worst case, that could lead to a deadlock by using up
all tokio executor threads.

In passing, fix a few typos in comments.

Fixes issue #6045.

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 libs/utils/src/simple_rcu.rs      | 71 ++++++++++++++++---------------
 pageserver/src/tenant/timeline.rs |  9 ++--
 2 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs
index 177a839d75..dc4a599111 100644
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -1,10 +1,10 @@
 //!
 //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
 //! similar to a lock, but it allows readers to "hold on" to an old value of RCU
-//! without blocking writers, and allows writing a new values without blocking
-//! readers. When you update the new value, the new value is immediately visible
+//! without blocking writers, and allows writing a new value without blocking
+//! readers. When you update the value, the new value is immediately visible
 //! to new readers, but the update waits until all existing readers have
-//! finishe, so that no one sees the old value anymore.
+//! finished, so that on return, no one sees the old value anymore.
 //!
 //! This implementation isn't wait-free; it uses an RwLock that is held for a
 //! short duration when the value is read or updated.
@@ -26,6 +26,7 @@
 //! Increment the value by one, and wait for old readers to finish:
 //!
 //! ```
+//! # async fn dox() {
 //! # let rcu = utils::simple_rcu::Rcu::new(1);
 //! let write_guard = rcu.lock_for_write();
 //!
@@ -36,15 +37,17 @@
 //!
 //! // Concurrent reads and writes are now possible again. Wait for all the readers
 //! // that still observe the old value to finish.
-//! waitlist.wait();
+//! waitlist.wait().await;
+//! # }
 //! ```
 //!
 #![warn(missing_docs)]
 
 use std::ops::Deref;
-use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
 use std::sync::{Arc, Weak};
-use std::sync::{Mutex, RwLock, RwLockWriteGuard};
+use std::sync::{RwLock, RwLockWriteGuard};
+
+use tokio::sync::watch;
 
 ///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -68,22 +71,21 @@ struct RcuCell<V> {
     value: V,
 
     /// A dummy channel. We never send anything to this channel. The point is
-    /// that when the RcuCell is dropped, any cloned Senders will be notified
+    /// that when the RcuCell is dropped, any subscribed Receivers will be notified
     /// that the channel is closed. Updaters can use this to wait out until the
     /// RcuCell has been dropped, i.e. until the old value is no longer in use.
     ///
-    /// We never do anything with the receiver, we just need to hold onto it so
-    /// that the Senders will be notified when it's dropped. But because it's
-    /// not Sync, we need a Mutex on it.
-    watch: (SyncSender<()>, Mutex<Receiver<()>>),
+    /// We never send anything to this, we just need to hold onto it so that the
+    /// Receivers will be notified when it's dropped.
+    watch: watch::Sender<()>,
 }
 
 impl<V> RcuCell<V> {
     fn new(value: V) -> Self {
-        let (watch_sender, watch_receiver) = sync_channel(0);
+        let (watch_sender, _) = watch::channel(());
         RcuCell {
             value,
-            watch: (watch_sender, Mutex::new(watch_receiver)),
+            watch: watch_sender,
         }
     }
 }
@@ -141,10 +143,10 @@ impl<V> Deref for RcuReadGuard<V> {
 ///
 /// Write guard returned by `write`
 ///
-/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
-/// it should only be held for a short duration!
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
+/// held for a short duration!
 ///
-/// Calling `store` consumes the guard, making new reads and new writes possible
+/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
 /// again.
 ///
 pub struct RcuWriteGuard<'a, V> {
@@ -179,7 +181,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
             // the watches for any that do.
             self.inner.old_cells.retain(|weak| {
                 if let Some(cell) = weak.upgrade() {
-                    watches.push(cell.watch.0.clone());
+                    watches.push(cell.watch.subscribe());
                     true
                 } else {
                     false
@@ -193,20 +195,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
 ///
 /// List of readers who can still see old values.
 ///
-pub struct RcuWaitList(Vec<SyncSender<()>>);
+pub struct RcuWaitList(Vec<watch::Receiver<()>>);
 
 impl RcuWaitList {
     ///
     /// Wait for old readers to finish.
     ///
-    pub fn wait(mut self) {
+    pub async fn wait(mut self) {
         // after all the old_cells are no longer in use, we're done
         for w in self.0.iter_mut() {
             // This will block until the Receiver is closed. That happens when
             // the RcuCell is dropped.
             #[allow(clippy::single_match)]
-            match w.send(()) {
-                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
+            match w.changed().await {
+                Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
                 Err(_) => {
                     // closed, which means that the cell has been dropped, and
                     // its value is no longer in use
@@ -220,11 +222,10 @@ impl RcuWaitList {
 mod tests {
     use super::*;
     use std::sync::{Arc, Mutex};
-    use std::thread::{sleep, spawn};
     use std::time::Duration;
 
-    #[test]
-    fn two_writers() {
+    #[tokio::test]
+    async fn two_writers() {
         let rcu = Rcu::new(1);
 
         let read1 = rcu.read();
@@ -248,33 +249,35 @@ mod tests {
         assert_eq!(*read1, 1);
 
         let log = Arc::new(Mutex::new(Vec::new()));
-        // Wait for the old readers to finish in separate threads.
+        // Wait for the old readers to finish in separate tasks.
         let log_clone = Arc::clone(&log);
-        let thread2 = spawn(move || {
-            wait2.wait();
+        let task2 = tokio::spawn(async move {
+            wait2.wait().await;
             log_clone.lock().unwrap().push("wait2 done");
         });
         let log_clone = Arc::clone(&log);
-        let thread3 = spawn(move || {
-            wait3.wait();
+        let task3 = tokio::spawn(async move {
+            wait3.wait().await;
             log_clone.lock().unwrap().push("wait3 done");
         });
 
         // without this sleep the test can pass on accident if the writer is slow
-        sleep(Duration::from_millis(500));
+        tokio::time::sleep(Duration::from_millis(100)).await;
 
         // Release first reader. This allows first write to finish, but calling
-        // wait() on the second one would still block.
+        // wait() on the 'task3' would still block.
         log.lock().unwrap().push("dropping read1");
         drop(read1);
-        thread2.join().unwrap();
+        task2.await.unwrap();
 
-        sleep(Duration::from_millis(500));
+        assert!(!task3.is_finished());
+
+        tokio::time::sleep(Duration::from_millis(100)).await;
 
         // Release second reader, and finish second writer.
         log.lock().unwrap().push("dropping read2");
         drop(read2);
-        thread3.join().unwrap();
+        task3.await.unwrap();
 
         assert_eq!(
             log.lock().unwrap().as_slice(),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f666f1049f..882a5ef199 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -478,7 +478,7 @@ impl Timeline {
             .map(|ancestor| ancestor.timeline_id)
     }
 
-    /// Lock and get timeline's GC cuttof
+    /// Lock and get timeline's GC cutoff
     pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
         self.latest_gc_cutoff_lsn.read()
     }
@@ -3971,7 +3971,7 @@ impl Timeline {
         // for details. This will block until the old value is no longer in use.
         //
         // The GC cutoff should only ever move forwards.
-        {
+        let waitlist = {
             let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
             ensure!(
                 *write_guard <= new_gc_cutoff,
@@ -3979,8 +3979,9 @@ impl Timeline {
                 *write_guard,
                 new_gc_cutoff
             );
-            write_guard.store_and_unlock(new_gc_cutoff).wait();
-        }
+            write_guard.store_and_unlock(new_gc_cutoff)
+        };
+        waitlist.wait().await;
 
         info!("GC starting");
 

From 10c77cb410a53a7176cffb444d7bc2f7413598bd Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 7 Dec 2023 11:01:08 +0200
Subject: [PATCH 040/202] temp: increase the wait tenant activation timeout
 (#6058)

5s is causing way too much noise. This is of course a temporary fix; we
should give the highest priority to tenants for which there are
pagestream openings, and the second highest to those with basebackups.

Deployment thread for context:
https://neondb.slack.com/archives/C03H1K0PGKH/p1701935048144479?thread_ts=1701765158.926659&cid=C03H1K0PGKH
---
 pageserver/src/page_service.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 9956e761e6..d5ca7f7382 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -67,9 +67,9 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
 
-// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
+// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
 // is not yet in state [`TenantState::Active`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 
 /// Read the end of a tar archive.
 ///

From 52718bb8ffad756754de405a82bed02e5ce04b0c Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 7 Dec 2023 11:39:40 +0200
Subject: [PATCH 041/202] fix(layer): metric splitting, span rename (#5902)

Per [feedback], split the Layer metrics. Also finally account for the
lost and [re-submitted feedback] on `layer_gc` by renaming it to
`layer_delete`; `Layer::garbage_collect_on_drop` is renamed to
`Layer::delete_on_drop`. References to "gc" are dropped from metric
names and elsewhere.

Also fixes how the cancellations were tracked: there was one rare
counter. Now there is a top level metric for cancelled inits, and the
rare "download failed but failed to communicate" counter is kept.

Fixes: #6027

[feedback]: https://github.com/neondatabase/neon/pull/5809#pullrequestreview-1720043251
[re-submitted feedback]: https://github.com/neondatabase/neon/pull/5108#discussion_r1401867311
---
 pageserver/src/metrics.rs                     |   2 +
 pageserver/src/tenant/storage_layer.rs        |   2 +-
 pageserver/src/tenant/storage_layer/layer.rs  | 287 ++++++++++--------
 .../src/tenant/timeline/layer_manager.rs      |   2 +-
 test_runner/regress/test_remote_storage.py    |   2 +-
 5 files changed, 162 insertions(+), 133 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 6e311041ba..b684c4fa90 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2094,6 +2094,8 @@ pub fn preinitialize_metrics() {
     // Tenant manager stats
     Lazy::force(&TENANT_MANAGER);
 
+    Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
+
     // countervecs
     [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
         .into_iter()
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 944e05883f..6e9a4932d8 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,7 +4,7 @@ pub mod delta_layer;
 mod filename;
 pub mod image_layer;
 mod inmemory_layer;
-mod layer;
+pub(crate) mod layer;
 mod layer_desc;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index e203d9d334..126d4d5563 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -222,8 +222,8 @@ impl Layer {
     ///
     /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
     /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
-    pub(crate) fn garbage_collect_on_drop(&self) {
-        self.0.garbage_collect_on_drop();
+    pub(crate) fn delete_on_drop(&self) {
+        self.0.delete_on_drop();
     }
 
     /// Return data needed to reconstruct given page at LSN.
@@ -331,10 +331,10 @@ impl Layer {
         Ok(())
     }
 
-    /// Waits until this layer has been dropped (and if needed, local garbage collection and remote
+    /// Waits until this layer has been dropped (and if needed, local file deletion and remote
     /// deletion scheduling has completed).
     ///
-    /// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
+    /// Does not start local deletion, use [`Self::delete_on_drop`] for that
     /// separatedly.
     #[cfg(feature = "testing")]
     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
@@ -423,8 +423,8 @@ struct LayerInner {
     /// Initialization and deinitialization are done while holding a permit.
     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
 
-    /// Do we want to garbage collect this when `LayerInner` is dropped
-    wanted_garbage_collected: AtomicBool,
+    /// Do we want to delete locally and remotely this when `LayerInner` is dropped
+    wanted_deleted: AtomicBool,
 
     /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
     /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
@@ -438,10 +438,6 @@ struct LayerInner {
     version: AtomicUsize,
 
     /// Allow subscribing to when the layer actually gets evicted.
-    ///
-    /// If in future we need to implement "wait until layer instances are gone and done", carrying
-    /// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
-    /// method for "wait_gc" which will wait to this being closed.
     status: tokio::sync::broadcast::Sender<Status>,
 
     /// Counter for exponential backoff with the download
@@ -483,14 +479,14 @@ enum Status {
 
 impl Drop for LayerInner {
     fn drop(&mut self) {
-        if !*self.wanted_garbage_collected.get_mut() {
+        if !*self.wanted_deleted.get_mut() {
             // should we try to evict if the last wish was for eviction?
             // feels like there's some hazard of overcrowding near shutdown near by, but we don't
             // run drops during shutdown (yet)
             return;
         }
 
-        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
+        let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
 
         let path = std::mem::take(&mut self.path);
         let file_name = self.layer_desc().filename();
@@ -517,8 +513,8 @@ impl Drop for LayerInner {
                     false
                 }
                 Err(e) => {
-                    tracing::error!("failed to remove garbage collected layer: {e}");
-                    LAYER_IMPL_METRICS.inc_gc_removes_failed();
+                    tracing::error!("failed to remove wanted deleted layer: {e}");
+                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
                     false
                 }
             };
@@ -540,15 +536,15 @@ impl Drop for LayerInner {
                         } else {
                             tracing::warn!("scheduling deletion on drop failed: {e:#}");
                         }
-                        LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
+                        LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
                     } else {
-                        LAYER_IMPL_METRICS.inc_completed_gcs();
+                        LAYER_IMPL_METRICS.inc_completed_deletes();
                     }
                 }
             } else {
                 // no need to nag that timeline is gone: under normal situation on
                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-                LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
             }
         });
     }
@@ -583,7 +579,7 @@ impl LayerInner {
             timeline: Arc::downgrade(timeline),
             have_remote_client: timeline.remote_client.is_some(),
             access_stats,
-            wanted_garbage_collected: AtomicBool::new(false),
+            wanted_deleted: AtomicBool::new(false),
             wanted_evicted: AtomicBool::new(false),
             inner,
             version: AtomicUsize::new(version),
@@ -594,16 +590,13 @@ impl LayerInner {
         }
     }
 
-    fn garbage_collect_on_drop(&self) {
-        let res = self.wanted_garbage_collected.compare_exchange(
-            false,
-            true,
-            Ordering::Release,
-            Ordering::Relaxed,
-        );
+    fn delete_on_drop(&self) {
+        let res =
+            self.wanted_deleted
+                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
 
         if res.is_ok() {
-            LAYER_IMPL_METRICS.inc_started_gcs();
+            LAYER_IMPL_METRICS.inc_started_deletes();
         }
     }
 
@@ -671,6 +664,10 @@ impl LayerInner {
                 // disable any scheduled but not yet running eviction deletions for this
                 let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
 
+                // count cancellations, which currently remain largely unexpected
+                let init_cancelled =
+                    scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
                 // no need to make the evict_and_wait wait for the actual download to complete
                 drop(self.status.send(Status::Downloaded));
 
@@ -679,6 +676,8 @@ impl LayerInner {
                     .upgrade()
                     .ok_or_else(|| DownloadError::TimelineShutdown)?;
 
+                // FIXME: grab a gate
+
                 let can_ever_evict = timeline.remote_client.as_ref().is_some();
 
                 // check if we really need to be downloaded; could have been already downloaded by a
@@ -739,6 +738,8 @@ impl LayerInner {
                     tracing::info!(waiters, "completing the on-demand download for other tasks");
                 }
 
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+
                 Ok((ResidentOrWantedEvicted::Resident(res), permit))
             };
 
@@ -867,14 +868,13 @@ impl LayerInner {
                     match res {
                         (Ok(()), _) => {
                             // our caller is cancellation safe so this is fine; if someone
-                            // else requests the layer, they'll find it already downloaded
-                            // or redownload.
+                            // else requests the layer, they'll find it already downloaded.
                             //
-                            // however, could be that we should consider marking the layer
-                            // for eviction? alas, cannot: because only DownloadedLayer
-                            // will handle that.
-                            tracing::info!("layer file download completed after requester had cancelled");
-                            LAYER_IMPL_METRICS.inc_download_completed_without_requester();
+                            // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
+                            //
+                            // FIXME(#6028): however, could be that we should consider marking the
+                            // layer for eviction? alas, cannot: because only DownloadedLayer will
+                            // handle that.
                         },
                         (Err(e), _) => {
                             // our caller is cancellation safe, but we might be racing with
@@ -994,12 +994,15 @@ impl LayerInner {
 
     /// `DownloadedLayer` is being dropped, so it calls this method.
     fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
-        let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
+        let delete = self.wanted_deleted.load(Ordering::Acquire);
         let evict = self.wanted_evicted.load(Ordering::Acquire);
         let can_evict = self.have_remote_client;
 
-        if gc {
-            // do nothing now, only in LayerInner::drop
+        if delete {
+            // do nothing now, only in LayerInner::drop -- this was originally implemented because
+            // we could had already scheduled the deletion at the time.
+            //
+            // FIXME: this is not true anymore, we can safely evict wanted deleted files.
         } else if can_evict && evict {
             let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
 
@@ -1014,7 +1017,7 @@ impl LayerInner {
             crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
                 let _g = span.entered();
 
-                // if LayerInner is already dropped here, do nothing because the garbage collection
+                // if LayerInner is already dropped here, do nothing because the delete on drop
                 // has already ran while we were in queue
                 let Some(this) = this.upgrade() else {
                     LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
@@ -1405,35 +1408,37 @@ impl From<ResidentLayer> for Layer {
     }
 }
 
-use metrics::{IntCounter, IntCounterVec};
+use metrics::IntCounter;
 
-struct LayerImplMetrics {
+pub(crate) struct LayerImplMetrics {
     started_evictions: IntCounter,
     completed_evictions: IntCounter,
-    cancelled_evictions: IntCounterVec,
+    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
 
-    started_gcs: IntCounter,
-    completed_gcs: IntCounter,
-    failed_gcs: IntCounterVec,
+    started_deletes: IntCounter,
+    completed_deletes: IntCounter,
+    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
 
-    rare_counters: IntCounterVec,
+    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
+    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
 }
 
 impl Default for LayerImplMetrics {
     fn default() -> Self {
-        let evictions = metrics::register_int_counter_vec!(
-            "pageserver_layer_evictions_count",
-            "Evictions started and completed in the Layer implementation",
-            &["state"]
+        use enum_map::Enum;
+
+        // reminder: these will be pageserver_layer_* with "_total" suffix
+
+        let started_evictions = metrics::register_int_counter!(
+            "pageserver_layer_started_evictions",
+            "Evictions started in the Layer implementation"
+        )
+        .unwrap();
+        let completed_evictions = metrics::register_int_counter!(
+            "pageserver_layer_completed_evictions",
+            "Evictions completed in the Layer implementation"
         )
         .unwrap();
-
-        let started_evictions = evictions
-            .get_metric_with_label_values(&["started"])
-            .unwrap();
-        let completed_evictions = evictions
-            .get_metric_with_label_values(&["completed"])
-            .unwrap();
 
         let cancelled_evictions = metrics::register_int_counter_vec!(
             "pageserver_layer_cancelled_evictions_count",
@@ -1442,24 +1447,36 @@ impl Default for LayerImplMetrics {
         )
         .unwrap();
 
-        // reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
-        let gcs = metrics::register_int_counter_vec!(
-            "pageserver_layer_gcs_count",
-            "Garbage collections started and completed in the Layer implementation",
-            &["state"]
+        let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = EvictionCancelled::from_usize(i);
+            let s = reason.as_str();
+            cancelled_evictions.with_label_values(&[s])
+        }));
+
+        let started_deletes = metrics::register_int_counter!(
+            "pageserver_layer_started_deletes",
+            "Deletions on drop pending in the Layer implementation"
+        )
+        .unwrap();
+        let completed_deletes = metrics::register_int_counter!(
+            "pageserver_layer_completed_deletes",
+            "Deletions on drop completed in the Layer implementation"
         )
         .unwrap();
 
-        let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
-        let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();
-
-        let failed_gcs = metrics::register_int_counter_vec!(
-            "pageserver_layer_failed_gcs_count",
-            "Different reasons for garbage collections to have failed",
+        let failed_deletes = metrics::register_int_counter_vec!(
+            "pageserver_layer_failed_deletes_count",
+            "Different reasons for deletions on drop to have failed",
             &["reason"]
         )
         .unwrap();
 
+        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = DeleteFailed::from_usize(i);
+            let s = reason.as_str();
+            failed_deletes.with_label_values(&[s])
+        }));
+
         let rare_counters = metrics::register_int_counter_vec!(
             "pageserver_layer_assumed_rare_count",
             "Times unexpected or assumed rare event happened",
@@ -1467,16 +1484,29 @@ impl Default for LayerImplMetrics {
         )
         .unwrap();
 
+        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let event = RareEvent::from_usize(i);
+            let s = event.as_str();
+            rare_counters.with_label_values(&[s])
+        }));
+
+        let inits_cancelled = metrics::register_int_counter!(
+            "pageserver_layer_inits_cancelled_count",
+            "Times Layer initialization was cancelled",
+        )
+        .unwrap();
+
         Self {
             started_evictions,
             completed_evictions,
             cancelled_evictions,
 
-            started_gcs,
-            completed_gcs,
-            failed_gcs,
+            started_deletes,
+            completed_deletes,
+            failed_deletes,
 
             rare_counters,
+            inits_cancelled,
         }
     }
 }
@@ -1489,57 +1519,33 @@ impl LayerImplMetrics {
         self.completed_evictions.inc();
     }
     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
-        self.cancelled_evictions
-            .get_metric_with_label_values(&[reason.as_str()])
-            .unwrap()
-            .inc()
+        self.cancelled_evictions[reason].inc()
     }
 
-    fn inc_started_gcs(&self) {
-        self.started_gcs.inc();
+    fn inc_started_deletes(&self) {
+        self.started_deletes.inc();
     }
-    fn inc_completed_gcs(&self) {
-        self.completed_gcs.inc();
+    fn inc_completed_deletes(&self) {
+        self.completed_deletes.inc();
     }
-    fn inc_gcs_failed(&self, reason: GcFailed) {
-        self.failed_gcs
-            .get_metric_with_label_values(&[reason.as_str()])
-            .unwrap()
-            .inc();
+    fn inc_deletes_failed(&self, reason: DeleteFailed) {
+        self.failed_deletes[reason].inc();
     }
 
-    /// Counted separatedly from failed gcs because we will complete the gc attempt regardless of
-    /// failure to delete local file.
-    fn inc_gc_removes_failed(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["gc_remove_failed"])
-            .unwrap()
-            .inc();
+    /// Counted separatedly from failed layer deletes because we will complete the layer deletion
+    /// attempt regardless of failure to delete local file.
+    fn inc_delete_removes_failed(&self) {
+        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
     }
 
-    /// Expected rare because requires a race with `evict_blocking` and
-    /// `get_or_maybe_download`.
+    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
     fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["retried_gomd"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
     }
 
-    /// Expected rare because cancellations are unexpected
-    fn inc_download_completed_without_requester(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["download_completed_without"])
-            .unwrap()
-            .inc();
-    }
-
-    /// Expected rare because cancellations are unexpected
+    /// Expected rare because cancellations are unexpected, and failures are unexpected
     fn inc_download_failed_without_requester(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["download_failed_without"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
     }
 
     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1547,37 +1553,30 @@ impl LayerImplMetrics {
     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
     /// Option.
     fn inc_raced_wanted_evicted_accesses(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["raced_wanted_evicted"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
     }
 
-    /// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
+    /// These are only expected for [`Self::inc_init_cancelled`] amount when
     /// running with remote storage.
     fn inc_init_needed_no_download(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["init_needed_no_download"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::InitWithoutDownload].inc();
     }
 
     /// Expected rare because all layer files should be readable and good
     fn inc_permanent_loading_failures(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["permanent_loading_failure"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
     }
 
     fn inc_broadcast_lagged(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["broadcast_lagged"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
+    }
+
+    fn inc_init_cancelled(&self) {
+        self.inits_cancelled.inc()
     }
 }
 
+#[derive(enum_map::Enum)]
 enum EvictionCancelled {
     LayerGone,
     TimelineGone,
@@ -1606,19 +1605,47 @@ impl EvictionCancelled {
     }
 }
 
-enum GcFailed {
+#[derive(enum_map::Enum)]
+enum DeleteFailed {
     TimelineGone,
     DeleteSchedulingFailed,
 }
 
-impl GcFailed {
+impl DeleteFailed {
     fn as_str(&self) -> &'static str {
         match self {
-            GcFailed::TimelineGone => "timeline_gone",
-            GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
+            DeleteFailed::TimelineGone => "timeline_gone",
+            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
         }
     }
 }
 
-static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
+#[derive(enum_map::Enum)]
+enum RareEvent {
+    RemoveOnDropFailed,
+    RetriedGetOrMaybeDownload,
+    DownloadFailedWithoutRequester,
+    UpgradedWantedEvicted,
+    InitWithoutDownload,
+    PermanentLoadingFailure,
+    EvictAndWaitLagged,
+}
+
+impl RareEvent {
+    fn as_str(&self) -> &'static str {
+        use RareEvent::*;
+
+        match self {
+            RemoveOnDropFailed => "remove_on_drop_failed",
+            RetriedGetOrMaybeDownload => "retried_gomd",
+            DownloadFailedWithoutRequester => "download_failed_without",
+            UpgradedWantedEvicted => "raced_wanted_evicted",
+            InitWithoutDownload => "init_needed_no_download",
+            PermanentLoadingFailure => "permanent_loading_failure",
+            EvictAndWaitLagged => "broadcast_lagged",
+        }
+    }
+}
+
+pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
     once_cell::sync::Lazy::new(LayerImplMetrics::default);
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index dcd82949dd..e38f5be209 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -243,7 +243,7 @@ impl LayerManager {
         //      map index without actually rebuilding the index.
         updates.remove_historic(desc);
         mapping.remove(layer);
-        layer.garbage_collect_on_drop();
+        layer.delete_on_drop();
     }
 
     pub(crate) fn contains(&self, layer: &Layer) -> bool {
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 9c2bb2db11..67daf5f901 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -840,7 +840,7 @@ def test_compaction_waits_for_upload(
     ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)"
 
     def layer_deletes_completed():
-        m = client.get_metric_value("pageserver_layer_gcs_count_total", {"state": "completed"})
+        m = client.get_metric_value("pageserver_layer_completed_deletes_total")
         if m is None:
             return 0
         return int(m)

From b7ffe24426ab01a528e04f840075323489cca552 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 7 Dec 2023 13:31:38 +0200
Subject: [PATCH 042/202] build: update tokio to 1.34.0, tokio-util 0.7.10
 (#6061)

We should still remember to bump the minimum crate versions for
libraries that begin to use the task tracker.
---
 Cargo.lock | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5639665758..f6630cc203 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2634,14 +2634,14 @@ dependencies = [
 
 [[package]]
 name = "mio"
-version = "0.8.6"
+version = "0.8.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9"
+checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
 dependencies = [
  "libc",
  "log",
  "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.45.0",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -3644,7 +3644,7 @@ dependencies = [
  "serde_json",
  "sha2",
  "smol_str",
- "socket2 0.5.3",
+ "socket2 0.5.5",
  "sync_wrapper",
  "task-local-extensions",
  "thiserror",
@@ -4731,9 +4731,9 @@ dependencies = [
 
 [[package]]
 name = "socket2"
-version = "0.5.3"
+version = "0.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877"
+checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
 dependencies = [
  "libc",
  "windows-sys 0.48.0",
@@ -5080,18 +5080,18 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.28.1"
+version = "1.34.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
+checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
 dependencies = [
- "autocfg",
+ "backtrace",
  "bytes",
  "libc",
  "mio",
  "num_cpus",
  "pin-project-lite",
  "signal-hook-registry",
- "socket2 0.4.9",
+ "socket2 0.5.5",
  "tokio-macros",
  "windows-sys 0.48.0",
 ]
@@ -5108,9 +5108,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-macros"
-version = "2.1.0"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
+checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -5145,7 +5145,7 @@ dependencies = [
  "pin-project-lite",
  "postgres-protocol",
  "postgres-types",
- "socket2 0.5.3",
+ "socket2 0.5.5",
  "tokio",
  "tokio-util",
 ]
@@ -5214,9 +5214,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-util"
-version = "0.7.8"
+version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
+checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
 dependencies = [
  "bytes",
  "futures-core",

From f9401fdd3194a9fed42674b63471f1c97d91d53a Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 7 Dec 2023 11:47:16 +0000
Subject: [PATCH 043/202] proxy: fix channel binding error messages (#6054)

## Problem

When channel binding failed, the error messages still said "channel
binding is not supported yet".

## Summary of changes

Fix error messages
---
 proxy/src/sasl.rs           | 6 ++++--
 proxy/src/scram/exchange.rs | 8 ++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs
index 6d1dd9fba5..da1cf21c6a 100644
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -30,6 +30,9 @@ pub enum Error {
     #[error("Bad client message: {0}")]
     BadClientMessage(&'static str),
 
+    #[error("Internal error: missing digest")]
+    MissingBinding,
+
     #[error(transparent)]
     Io(#[from] io::Error),
 }
@@ -38,8 +41,7 @@ impl UserFacingError for Error {
     fn to_string_client(&self) -> String {
         use Error::*;
         match self {
-            // TODO: add support for channel binding
-            ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(),
+            ChannelBindingFailed(m) => m.to_string(),
             ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
             _ => "authentication protocol violation".to_string(),
         }
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 319d9b1014..facaba3798 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -106,14 +106,14 @@ impl sasl::Mechanism for Exchange<'_> {
 
                 let channel_binding = cbind_flag.encode(|_| match &self.tls_server_end_point {
                     config::TlsServerEndPoint::Sha256(x) => Ok(x),
-                    config::TlsServerEndPoint::Undefined => {
-                        Err(SaslError::ChannelBindingFailed("no cert digest provided"))
-                    }
+                    config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding),
                 })?;
 
                 // This might've been caused by a MITM attack
                 if client_final_message.channel_binding != channel_binding {
-                    return Err(SaslError::ChannelBindingFailed("data mismatch"));
+                    return Err(SaslError::ChannelBindingFailed(
+                        "insecure connection: secure channel data mismatch",
+                    ));
                 }
 
                 if client_final_message.nonce != server_first_message.nonce() {

From e89e41f8baab2006a8e41f14b6e3968dddd12f86 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 7 Dec 2023 12:27:16 +0000
Subject: [PATCH 044/202] tests: update for tenant generations (#5449)

## Problem

Some existing tests are written in a way that's incompatible with tenant
generations.

## Summary of changes

Update all the tests that need updating. This includes things like
calling through the NeonPageserver.tenant_attach helper to get a
generation number, instead of calling directly into the pageserver API.
There are also various more subtle cases.
---
 control_plane/src/bin/attachment_service.rs   |  13 +
 libs/pageserver_api/src/models.rs             |   3 +-
 libs/remote_storage/src/local_fs.rs           |  34 +-
 pageserver/src/deletion_queue/list_writer.rs  |  11 +
 scripts/export_import_between_pageservers.py  |   4 +-
 test_runner/fixtures/neon_fixtures.py         |  53 ++-
 test_runner/fixtures/pageserver/http.py       |  26 +-
 test_runner/fixtures/pageserver/types.py      |   7 +-
 test_runner/fixtures/remote_storage.py        |  44 ++-
 .../regress/test_attach_tenant_config.py      |  18 +-
 test_runner/regress/test_auth.py              |   4 +-
 test_runner/regress/test_branching.py         |  11 +-
 test_runner/regress/test_broken_timeline.py   |   2 +-
 test_runner/regress/test_change_pageserver.py |  13 +
 test_runner/regress/test_duplicate_layers.py  |   7 +-
 test_runner/regress/test_import.py            |   6 +-
 .../regress/test_layers_from_future.py        |  32 +-
 test_runner/regress/test_neon_cli.py          |   4 +
 test_runner/regress/test_pageserver_api.py    |  12 +-
 test_runner/regress/test_remote_storage.py    |  15 +-
 test_runner/regress/test_tenant_conf.py       |   2 +-
 test_runner/regress/test_tenant_delete.py     |   4 +-
 test_runner/regress/test_tenant_detach.py     |  46 ++-
 test_runner/regress/test_tenant_relocation.py | 348 +++++++-----------
 .../test_tenants_with_remote_storage.py       |   4 +-
 test_runner/regress/test_timeline_delete.py   |   4 +-
 test_runner/regress/test_wal_acceptor.py      |  14 +-
 27 files changed, 424 insertions(+), 317 deletions(-)

diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs
index be7cff352c..e50c8fbba0 100644
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -201,6 +201,12 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
         // TODO(sharding): make this shard-aware
         if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
             let valid = tenant_state.generation == req_tenant.gen;
+            tracing::info!(
+                "handle_validate: {}(gen {}): valid={valid} (latest {})",
+                req_tenant.id,
+                req_tenant.gen,
+                tenant_state.generation
+            );
             response.tenants.push(ValidateResponseTenant {
                 id: req_tenant.id,
                 valid,
@@ -250,6 +256,13 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
     tenant_state.pageserver = attach_req.node_id;
     let generation = tenant_state.generation;
 
+    tracing::info!(
+        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
+        attach_req.tenant_id,
+        tenant_state.generation,
+        attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+    );
+
     locked.save().await.map_err(ApiError::InternalServerError)?;
 
     json_response(
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 2234a06501..495a58e865 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -323,6 +323,7 @@ impl TenantConfigRequest {
 
 #[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
+    #[serde(default)]
     pub config: TenantAttachConfig,
     #[serde(default)]
     pub generation: Option<u32>,
@@ -330,7 +331,7 @@ pub struct TenantAttachRequest {
 
 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
     #[serde(flatten)]
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 1be50ce565..fccc78de20 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -99,27 +99,35 @@ impl LocalFs {
         };
 
         // If we were given a directory, we may use it as our starting point.
-        // Otherwise, we must go up to the parent directory.  This is because
+        // Otherwise, we must go up to the first ancestor dir that exists.  This is because
         // S3 object list prefixes can be arbitrary strings, but when reading
         // the local filesystem we need a directory to start calling read_dir on.
         let mut initial_dir = full_path.clone();
-        match fs::metadata(full_path.clone()).await {
-            Ok(meta) => {
-                if !meta.is_dir() {
+        loop {
+            // Did we make it to the root?
+            if initial_dir.parent().is_none() {
+                anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
+            }
+
+            match fs::metadata(initial_dir.clone()).await {
+                Ok(meta) if meta.is_dir() => {
+                    // We found a directory, break
+                    break;
+                }
+                Ok(_meta) => {
                     // It's not a directory: strip back to the parent
                     initial_dir.pop();
                 }
-            }
-            Err(e) if e.kind() == ErrorKind::NotFound => {
-                // It's not a file that exists: strip the prefix back to the parent directory
-                initial_dir.pop();
-            }
-            Err(e) => {
-                // Unexpected I/O error
-                anyhow::bail!(e)
+                Err(e) if e.kind() == ErrorKind::NotFound => {
+                    // It's not a file that exists: strip the prefix back to the parent directory
+                    initial_dir.pop();
+                }
+                Err(e) => {
+                    // Unexpected I/O error
+                    anyhow::bail!(e)
+                }
             }
         }
-
         // Note that Utf8PathBuf starts_with only considers full path segments, but
         // object prefixes are arbitrary strings, so we need the strings for doing
         // starts_with later.
diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs
index 7ff27ceb44..3a3d600ac2 100644
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -312,7 +312,18 @@ impl ListWriter {
                 for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
                     if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
                         if attached_gen.previous() == tenant_list.generation {
+                            info!(
+                                seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                                shard_id=%tenant_shard_id.shard_slug(),
+                                old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                                "Updating gen on recovered list");
                             tenant_list.generation = *attached_gen;
+                        } else {
+                            info!(
+                                seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                                shard_id=%tenant_shard_id.shard_slug(),
+                                old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                                "Encountered stale generation on recovered list");
                         }
                     }
                 }
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index 77e4310eac..ff584bd4b0 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -266,9 +266,7 @@ class NeonPageserverHttpClient(requests.Session):
     def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
         res = self.post(
             f"http://{self.host}:{self.port}/v1/tenant",
-            json={
-                "new_tenant_id": new_tenant_id.hex,
-            },
+            json={"new_tenant_id": new_tenant_id.hex, "generation": 1},
         )
 
         if res.status_code == 409:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9545dc2dd5..617d4806b2 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -455,7 +455,7 @@ class NeonEnvBuilder:
         self.preserve_database_files = preserve_database_files
         self.initial_tenant = initial_tenant or TenantId.generate()
         self.initial_timeline = initial_timeline or TimelineId.generate()
-        self.enable_generations = False
+        self.enable_generations = True
         self.scrub_on_exit = False
         self.test_output_dir = test_output_dir
 
@@ -1571,6 +1571,20 @@ class NeonAttachmentService:
         )
         response.raise_for_status()
 
+    def inspect(self, tenant_id: TenantId) -> Optional[tuple[int, int]]:
+        response = requests.post(
+            f"{self.env.control_plane_api}/inspect",
+            json={"tenant_id": str(tenant_id)},
+        )
+        response.raise_for_status()
+        json = response.json()
+        log.info(f"Response: {json}")
+        if json["attachment"]:
+            # Explicit int() to make python type linter happy
+            return (int(json["attachment"][0]), int(json["attachment"][1]))
+        else:
+            return None
+
     def __enter__(self) -> "NeonAttachmentService":
         return self
 
@@ -1769,13 +1783,10 @@ class NeonPageserver(PgProtocol):
         Tenant attachment passes through here to acquire a generation number before proceeding
         to call into the pageserver HTTP client.
         """
-        if self.env.attachment_service is not None:
-            generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
-        else:
-            generation = None
-
         client = self.http_client()
-        return client.tenant_attach(tenant_id, config, config_null, generation=generation)
+        return client.tenant_attach(
+            tenant_id, config, config_null, generation=self.maybe_get_generation(tenant_id)
+        )
 
     def tenant_detach(self, tenant_id: TenantId):
         if self.env.attachment_service is not None:
@@ -1784,6 +1795,34 @@ class NeonPageserver(PgProtocol):
         client = self.http_client()
         return client.tenant_detach(tenant_id)
 
+    def tenant_create(
+        self,
+        tenant_id: TenantId,
+        conf: Optional[Dict[str, Any]] = None,
+        auth_token: Optional[str] = None,
+    ) -> TenantId:
+        client = self.http_client(auth_token=auth_token)
+        return client.tenant_create(
+            tenant_id, conf, generation=self.maybe_get_generation(tenant_id)
+        )
+
+    def tenant_load(self, tenant_id: TenantId):
+        client = self.http_client()
+        return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id))
+
+    def maybe_get_generation(self, tenant_id: TenantId):
+        """
+        For tests that would like to use an HTTP client directly instead of using
+        the `tenant_attach` and `tenant_create` helpers here: issue a generation
+        number for a tenant.
+
+        Returns None if the attachment service is not enabled (legacy mode)
+        """
+        if self.env.attachment_service is not None:
+            return self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+        else:
+            return None
+
 
 def append_pageserver_param_overrides(
     params_to_update: List[str],
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index eccab5fb6a..3e75bac424 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -210,16 +210,25 @@ class PageserverHttpClient(requests.Session):
         return res_json
 
     def tenant_create(
-        self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
+        self,
+        new_tenant_id: TenantId,
+        conf: Optional[Dict[str, Any]] = None,
+        generation: Optional[int] = None,
     ) -> TenantId:
         if conf is not None:
             assert "new_tenant_id" not in conf.keys()
+
+        body: Dict[str, Any] = {
+            "new_tenant_id": str(new_tenant_id),
+            **(conf or {}),
+        }
+
+        if generation is not None:
+            body.update({"generation": generation})
+
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant",
-            json={
-                "new_tenant_id": str(new_tenant_id),
-                **(conf or {}),
-            },
+            json=body,
         )
         self.verbose_error(res)
         if res.status_code == 409:
@@ -273,8 +282,11 @@ class PageserverHttpClient(requests.Session):
         self.verbose_error(res)
         return res
 
-    def tenant_load(self, tenant_id: TenantId):
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
+    def tenant_load(self, tenant_id: TenantId, generation=None):
+        body = None
+        if generation is not None:
+            body = {"generation": generation}
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body)
         self.verbose_error(res)
 
     def tenant_ignore(self, tenant_id: TenantId):
diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py
index 30e3f527bf..b3c1174b35 100644
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/types.py
@@ -6,9 +6,8 @@ from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn
 
 @dataclass
 class IndexLayerMetadata:
-    @classmethod
-    def from_json(cls, d: Dict[str, Any]):
-        return {}
+    file_size: int
+    generation: int
 
 
 @dataclass(frozen=True)
@@ -139,7 +138,7 @@ class IndexPartDump:
     def from_json(cls, d: Dict[str, Any]) -> "IndexPartDump":
         return IndexPartDump(
             layer_metadata={
-                parse_layer_file_name(n): IndexLayerMetadata.from_json(v)
+                parse_layer_file_name(n): IndexLayerMetadata(v["file_size"], v["generation"])
                 for n, v in d["layer_metadata"].items()
             },
             disk_consistent_lsn=Lsn(d["disk_consistent_lsn"]),
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 954c3142a3..d8361133d7 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -12,7 +12,6 @@ import boto3
 from mypy_boto3_s3 import S3Client
 
 from fixtures.log_helper import log
-from fixtures.pageserver.types import LayerFileName
 from fixtures.types import TenantId, TimelineId
 
 TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
@@ -88,13 +87,46 @@ class LocalFsStorage:
     def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
         return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)
 
-    def layer_path(
-        self, tenant_id: TenantId, timeline_id: TimelineId, layer_file_name: LayerFileName
-    ):
-        return self.timeline_path(tenant_id, timeline_id) / layer_file_name.to_str()
+    def timeline_latest_generation(self, tenant_id, timeline_id):
+        timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id))
+        index_parts = [f for f in timeline_files if f.startswith("index_part")]
+
+        def parse_gen(filename):
+            log.info(f"parsing index_part '{filename}'")
+            parts = filename.split("-")
+            if len(parts) == 2:
+                return int(parts[1], 16)
+            else:
+                return None
+
+        generations = sorted([parse_gen(f) for f in index_parts])
+        if len(generations) == 0:
+            raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}")
+        return generations[-1]
 
     def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
-        return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME
+        latest_gen = self.timeline_latest_generation(tenant_id, timeline_id)
+        if latest_gen is None:
+            filename = TIMELINE_INDEX_PART_FILE_NAME
+        else:
+            filename = f"{TIMELINE_INDEX_PART_FILE_NAME}-{latest_gen:08x}"
+
+        return self.timeline_path(tenant_id, timeline_id) / filename
+
+    def remote_layer_path(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        local_name: str,
+        generation: Optional[int] = None,
+    ):
+        if generation is None:
+            generation = self.timeline_latest_generation(tenant_id, timeline_id)
+
+        assert generation is not None, "Cannot calculate remote layer path without generation"
+
+        filename = f"{local_name}-{generation:08x}"
+        return self.timeline_path(tenant_id, timeline_id) / filename
 
     def index_content(self, tenant_id: TenantId, timeline_id: TimelineId):
         with self.index_path(tenant_id, timeline_id).open("r") as f:
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 10cffb1d6c..70d386a566 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -100,7 +100,6 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
 
     env = negative_env.neon_env
     tenant_id = negative_env.tenant_id
-    ps_http = env.pageserver.http_client()
 
     config_with_unknown_keys = {
         "compaction_period": "1h",
@@ -108,16 +107,16 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
     }
 
     with pytest.raises(PageserverApiException) as e:
-        ps_http.tenant_attach(tenant_id, config=config_with_unknown_keys)
+        env.pageserver.tenant_attach(tenant_id, config=config_with_unknown_keys)
     assert e.type == PageserverApiException
     assert e.value.status_code == 400
 
 
 @pytest.mark.parametrize("content_type", [None, "application/json"])
-def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
+def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
     """
-    For backwards-compatibility: if we send an empty body,
-    the request should be accepted and the config should be the default config.
+    When the 'config' body attribute is omitted, the request should be accepted
+    and the tenant should use the default configuration
     """
     env = positive_env
     ps_http = env.pageserver.http_client()
@@ -128,9 +127,14 @@ def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
     ps_http.tenant_detach(tenant_id)
     assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]
 
+    body = {}
+    gen = env.pageserver.maybe_get_generation(tenant_id)
+    if gen is not None:
+        body["generation"] = gen
+
     ps_http.post(
         f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
-        data=b"",
+        json=body,
         headers=None if content_type else {"Content-Type": "application/json"},
     ).raise_for_status()
 
@@ -191,7 +195,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
     }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
 
     ps_http.tenant_detach(tenant_id)
-    ps_http.tenant_attach(tenant_id, config=fully_custom_config)
+    env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)
 
     assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config
     assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set(
diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py
index f729bdee98..7487106c44 100644
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -60,14 +60,14 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
         assert_client_authorized(env, invalid_tenant_http_client)
 
     # create tenant using management token
-    pageserver_http_client.tenant_create(TenantId.generate())
+    env.pageserver.tenant_create(TenantId.generate(), auth_token=pageserver_token)
 
     # fail to create tenant using tenant token
     with pytest.raises(
         PageserverApiException,
         match="Forbidden: JWT authentication error",
     ):
-        tenant_http_client.tenant_create(TenantId.generate())
+        env.pageserver.tenant_create(TenantId.generate(), auth_token=tenant_token)
 
 
 def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index c4f743204e..a908dd713a 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -158,7 +158,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
 
     # pause all uploads
     ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
 
     initial_branch = "initial_branch"
 
@@ -200,7 +200,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
 
     # pause all uploads
     ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
 
     def start_creating_timeline():
         with pytest.raises(RequestException):
@@ -257,7 +257,7 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N
 
     # pause all uploads
     ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
 
     def start_creating_timeline():
         ps_http.timeline_create(
@@ -343,8 +343,7 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
     )
     ps_http = env.pageserver.http_client()
 
-    # pause all uploads
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
 
     # Create a timeline whose creation will succeed.  The tenant will need at least one
     # timeline to be loadable.
@@ -397,7 +396,7 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB
     )
     ps_http = env.pageserver.http_client()
 
-    ps_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
     ps_http.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
 
     # pause all uploads
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 84a322039a..53eeb8bbe9 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -160,7 +160,7 @@ def test_timeline_init_break_before_checkpoint_recreate(
         ]
     )
 
-    pageserver_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
     tenant_id = env.initial_tenant
 
     timelines_dir = env.pageserver.timeline_dir(tenant_id)
diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py
index 410bf03c2b..1b6c982850 100644
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -14,6 +14,11 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     )
     env = neon_env_builder.init_start()
 
+    for pageserver in env.pageservers:
+        # This test dual-attaches a tenant, so one of the pageservers will
+        # be running with a stale generation.
+        pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     env.neon_cli.create_branch("test_change_pageserver")
     endpoint = env.endpoints.create_start("test_change_pageserver")
 
@@ -79,6 +84,11 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     # Try failing back, and this time we will stop the current pageserver before reconfiguring
     # the endpoint.  Whereas the previous reconfiguration was like a healthy migration, this
     # is more like what happens in an unexpected  pageserver failure.
+    #
+    # Since we're dual-attached, we need to tip off the attachment service to treat the one
+    # we're about to start as the attached pageserver.
+    assert env.attachment_service is not None
+    env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
     env.pageservers[0].start()
     env.pageservers[1].stop()
 
@@ -88,6 +98,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     assert fetchone() == (100000,)
 
     env.pageservers[0].stop()
+    # Since we're dual-attached, we need to tip off the attachment service to treat the one
+    # we're about to start as the attached pageserver.
+    env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id)
     env.pageservers[1].start()
 
     # Test a (former) bug where a child process spins without updating its connection string
diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py
index bcf99cae7c..224e6f50c7 100644
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -112,7 +112,9 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
 
     assert l1_found is not None, "failed to find L1 locally"
 
-    uploaded = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / l1_found.name
+    uploaded = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, l1_found.name
+    )
     assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
 
     env.pageserver.start()
@@ -139,4 +141,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
 
     wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
 
+    uploaded = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, l1_found.name
+    )
     assert uploaded.exists(), "the L1 is uploaded"
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 8da5f1eec2..920e8d0b72 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -84,8 +84,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
-    client = env.pageserver.http_client()
-    client.tenant_create(tenant)
+    env.pageserver.tenant_create(tenant)
 
     env.pageserver.allowed_errors.extend(
         [
@@ -149,6 +148,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
         ".*WARN.*ignored .* unexpected bytes after the tar archive.*"
     )
 
+    client = env.pageserver.http_client()
     timeline_delete_wait_completed(client, tenant, timeline)
 
     # Importing correct backup works
@@ -292,7 +292,7 @@ def _import(
     # Import to pageserver
     endpoint_id = "ep-import_from_pageserver"
     client = env.pageserver.http_client()
-    client.tenant_create(tenant)
+    env.pageserver.tenant_create(tenant)
     env.neon_cli.raw_cli(
         [
             "timeline",
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 7ce0bdaeba..ef2b2185c3 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -149,19 +149,28 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
         f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}"
     )
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
-    future_layer_path = env.pageserver_remote_storage.layer_path(
-        tenant_id, timeline_id, future_layer
+    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, future_layer.to_str()
     )
     log.info(f"future layer path: {future_layer_path}")
     pre_stat = future_layer_path.stat()
     time.sleep(1.1)  # so that we can use change in pre_stat.st_mtime to detect overwrites
 
+    def get_generation_number():
+        assert env.attachment_service is not None
+        attachment = env.attachment_service.inspect(tenant_id)
+        assert attachment is not None
+        return attachment[0]
+
     # force removal of layers from the future
     tenant_conf = ps_http.tenant_config(tenant_id)
-    ps_http.tenant_detach(tenant_id)
+    generation_before_detach = get_generation_number()
+    env.pageserver.tenant_detach(tenant_id)
     failpoint_name = "before-delete-layer-pausable"
+
     ps_http.configure_failpoints((failpoint_name, "pause"))
-    ps_http.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
+    env.pageserver.tenant_attach(tenant_id, tenant_conf.tenant_specific_overrides)
+    generation_after_reattach = get_generation_number()
     wait_until_tenant_active(ps_http, tenant_id)
 
     # Ensure the IndexPart upload that unlinks the layer file finishes, i.e., doesn't clog the queue.
@@ -177,6 +186,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
         assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
 
     wait_until(10, 0.5, delete_at_pause_point)
+    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
+    )
+    log.info(f"future layer path: {future_layer_path}")
     assert future_layer_path.exists()
 
     # wait for re-ingestion of the WAL from safekeepers into the in-memory layer
@@ -215,12 +228,17 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     # Examine the resulting S3 state.
     log.info("integrity-check the remote storage")
     ip = get_index_part()
-    for layer_file_name in ip.layer_metadata.keys():
-        layer_path = env.pageserver_remote_storage.layer_path(
-            tenant_id, timeline_id, layer_file_name
+    for layer_file_name, layer_metadata in ip.layer_metadata.items():
+        log.info(f"Layer metadata {layer_file_name.to_str()}: {layer_metadata}")
+        layer_path = env.pageserver_remote_storage.remote_layer_path(
+            tenant_id, timeline_id, layer_file_name.to_str(), layer_metadata.generation
         )
         assert layer_path.exists(), f"{layer_file_name.to_str()}"
 
     log.info("assert that the overwritten layer won")
+    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, future_layer.to_str(), generation=generation_after_reattach
+    )
     final_stat = future_layer_path.stat()
+    log.info(f"future layer path: {future_layer_path}")
     assert final_stat.st_mtime != pre_stat.st_mtime
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index de18ea0e6b..16d120e24a 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -133,6 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
     # Stop default ps/sk
     env.neon_cli.pageserver_stop(env.pageserver.id)
     env.neon_cli.safekeeper_stop()
+    env.neon_cli.attachment_service_stop(False)
 
     # Keep NeonEnv state up to date, it usually owns starting/stopping services
     env.pageserver.running = False
@@ -173,6 +174,9 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1)
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2)
 
+    # Stop this to get out of the way of the following `start`
+    env.neon_cli.attachment_service_stop(False)
+
     # Default start
     res = env.neon_cli.raw_cli(["start"])
     res.check_returncode()
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 2d83788193..64e41a2dd5 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -8,7 +8,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
@@ -62,7 +61,10 @@ def test_pageserver_init_node_id(
     assert "has node id already, it cannot be overridden" in bad_update.stderr
 
 
-def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_tenant: TenantId):
+def check_client(env: NeonEnv, client: PageserverHttpClient):
+    pg_version = env.pg_version
+    initial_tenant = env.initial_tenant
+
     client.check_status()
 
     # check initial tenant is there
@@ -70,7 +72,7 @@ def check_client(pg_version: PgVersion, client: PageserverHttpClient, initial_te
 
     # create new tenant and check it is also there
     tenant_id = TenantId.generate()
-    client.tenant_create(tenant_id)
+    client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id))
     assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}
 
     timelines = client.timeline_list(tenant_id)
@@ -181,7 +183,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
 def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
     env = neon_simple_env
     with env.pageserver.http_client() as client:
-        check_client(env.pg_version, client, env.initial_tenant)
+        check_client(env, client)
 
 
 def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder):
@@ -191,4 +193,4 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
     pageserver_token = env.auth_keys.generate_pageserver_token()
 
     with env.pageserver.http_client(auth_token=pageserver_token) as client:
-        check_client(env.pg_version, client, env.initial_tenant)
+        check_client(env, client)
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 67daf5f901..0a5046e219 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -23,7 +23,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import (
-    TIMELINE_INDEX_PART_FILE_NAME,
     LocalFsStorage,
     RemoteStorageKind,
     available_remote_storages,
@@ -350,6 +349,13 @@ def test_remote_storage_upload_queue_retries(
     env.pageserver.stop(immediate=True)
     env.endpoints.stop_all()
 
+    # We are about to forcibly drop local dirs.  The attachment service will increment the generation in re-attach,
+    # and we increment it again later when actually attaching the tenant.  This skips a generation, so we may get
+    # these warnings if there was a durable but un-executed deletion list at the time of restart.
+    env.pageserver.allowed_errors.extend(
+        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+    )
+
     dir_to_clear = env.pageserver.tenant_dir()
     shutil.rmtree(dir_to_clear)
     os.mkdir(dir_to_clear)
@@ -648,7 +654,7 @@ def test_empty_branch_remote_storage_upload(neon_env_builder: NeonEnvBuilder):
     ), f"Expected to have an initial timeline and the branch timeline only, but got {timelines_before_detach}"
 
     client.tenant_detach(env.initial_tenant)
-    client.tenant_attach(env.initial_tenant)
+    env.pageserver.tenant_attach(env.initial_tenant)
     wait_until_tenant_state(client, env.initial_tenant, "Active", 5)
 
     timelines_after_detach = set(
@@ -758,10 +764,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
         # this is because creating a timeline always awaits for the uploads to complete
         assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
 
-        assert (
-            new_branch_on_remote_storage / TIMELINE_INDEX_PART_FILE_NAME
+        assert env.pageserver_remote_storage.index_path(
+            env.initial_tenant, new_branch_timeline_id
         ).is_file(), "uploads scheduled during initial load should had been awaited for"
     finally:
+        barrier.abort()
         create_thread.join()
 
 
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index fcc3243e81..f4565c2ee2 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -314,7 +314,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
 
     assert not config_path.exists(), "detach did not remove config file"
 
-    http_client.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
     wait_until(
         number_of_iterations=5,
         interval=1,
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 89c474286a..3929f59e46 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -380,7 +380,7 @@ def test_tenant_delete_is_resumed_on_attach(
     env.pageserver.start()
 
     # now we call attach
-    ps_http.tenant_attach(tenant_id=tenant_id)
+    env.pageserver.tenant_attach(tenant_id=tenant_id)
 
     # delete should be resumed
     wait_tenant_status_404(ps_http, tenant_id, iterations)
@@ -419,7 +419,7 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
         f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn"
     )
 
-    pageserver_http.tenant_create(env.initial_tenant)
+    env.pageserver.tenant_create(env.initial_tenant)
 
     failpoint = "flush-layer-cancel-after-writing-layer-out-pausable"
     pageserver_http.configure_failpoints((failpoint, "pause"))
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index df497c0f7b..5c57fb233b 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -82,6 +82,10 @@ def test_tenant_reattach(
 
     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
 
+    # Our re-attach may race with the deletion queue processing LSN updates
+    # from the original attachment.
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
         with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t(key int primary key, value text)")
@@ -112,8 +116,8 @@ def test_tenant_reattach(
 
     if mode == ReattachMode.REATTACH_EXPLICIT:
         # Explicitly detach then attach the tenant as two separate API calls
-        pageserver_http.tenant_detach(tenant_id)
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_detach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)
     elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP):
         # Use the reset API to detach/attach in one shot
         pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP)
@@ -192,6 +196,9 @@ def test_tenant_reattach_while_busy(
     updates_finished = 0
     updates_to_perform = 0
 
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    env = neon_env_builder.init_start()
+
     # Run random UPDATEs on test table. On failure, try again.
     async def update_table(pg_conn: asyncpg.Connection):
         nonlocal updates_started, updates_finished, updates_to_perform
@@ -223,7 +230,7 @@ def test_tenant_reattach_while_busy(
         pageserver_http.tenant_detach(tenant_id)
         await asyncio.sleep(1)
         log.info("Re-attaching tenant")
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)
         log.info("Re-attach finished")
 
         # Continue with 5000 more updates
@@ -244,9 +251,6 @@ def test_tenant_reattach_while_busy(
 
         assert updates_finished == updates_to_perform
 
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-    env = neon_env_builder.init_start()
-
     pageserver_http = env.pageserver.http_client()
 
     # create new nenant
@@ -454,6 +458,10 @@ def test_detach_while_attaching(
 
     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
 
+    # Our re-attach may race with the deletion queue processing LSN updates
+    # from the original attachment.
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
     # shared_buffers, otherwise the SELECT after restart will just return answer
     # from shared_buffers without hitting the page server, which defeats the point
@@ -487,7 +495,7 @@ def test_detach_while_attaching(
     # And re-attach
     pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
 
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
 
     # Before it has chance to finish, detach it again
     pageserver_http.tenant_detach(tenant_id)
@@ -497,7 +505,7 @@ def test_detach_while_attaching(
 
     # Attach it again. If the GC and compaction loops from the previous attach/detach
     # cycle are still running, things could get really confusing..
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
 
     with endpoint.cursor() as cur:
         cur.execute("SELECT COUNT(*) FROM foo")
@@ -556,7 +564,7 @@ def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
     ), "Ignored tenant should not be reloaded after pageserver restart"
 
     # now, load it from the local files and expect it works
-    pageserver_http.tenant_load(tenant_id=ignored_tenant_id)
+    env.pageserver.tenant_load(tenant_id=ignored_tenant_id)
     wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
 
     tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
@@ -611,7 +619,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
     assert layers_removed, f"Found no layers for tenant {timeline_dir}"
 
     # now, load it from the local files and expect it to work due to remote storage restoration
-    pageserver_http.tenant_load(tenant_id=tenant_id)
+    env.pageserver.tenant_load(tenant_id=tenant_id)
     wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
 
     tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
@@ -645,13 +653,13 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
         expected_exception=PageserverApiException,
         match=f"tenant {tenant_id} already exists, state: Active",
     ):
-        pageserver_http.tenant_load(tenant_id)
+        env.pageserver.tenant_load(tenant_id)
 
     with pytest.raises(
         expected_exception=PageserverApiException,
         match=f"tenant {tenant_id} already exists, state: Active",
     ):
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)
 
     pageserver_http.tenant_ignore(tenant_id)
 
@@ -660,7 +668,7 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
         expected_exception=PageserverApiException,
         match="tenant directory already exists",
     ):
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)
 
 
 def test_ignore_while_attaching(
@@ -679,6 +687,10 @@ def test_ignore_while_attaching(
 
     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
 
+    # Our re-attach may race with the deletion queue processing LSN updates
+    # from the original attachment.
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     data_id = 1
     data_secret = "very secret secret"
     insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
@@ -689,7 +701,7 @@ def test_ignore_while_attaching(
     pageserver_http.tenant_detach(tenant_id)
     # And re-attach, but stop attach task_mgr task from completing
     pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
     # Run ignore on the task, thereby cancelling the attach.
     # XXX This should take priority over attach, i.e., it should cancel the attach task.
     # But neither the failpoint, nor the proper remote_timeline_client download functions,
@@ -704,7 +716,7 @@ def test_ignore_while_attaching(
         expected_exception=PageserverApiException,
         match="tenant directory already exists",
     ):
-        pageserver_http.tenant_attach(tenant_id)
+        env.pageserver.tenant_attach(tenant_id)
 
     tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
     assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
@@ -714,7 +726,7 @@ def test_ignore_while_attaching(
 
     # Calling load will bring the tenant back online
     pageserver_http.configure_failpoints([("attach-before-activate", "off")])
-    pageserver_http.tenant_load(tenant_id)
+    env.pageserver.tenant_load(tenant_id)
 
     wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
 
@@ -818,7 +830,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
         found_broken
     ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
 
-    client.tenant_load(env.initial_tenant)
+    env.pageserver.tenant_load(env.initial_tenant)
 
     found_active = False
     active, broken_set = ([], [])
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 8be0f0449b..c7ff80d675 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -7,13 +7,8 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
 
 import pytest
-from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    Endpoint,
-    NeonEnv,
-    NeonEnvBuilder,
-)
+from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_tenant_state,
@@ -30,7 +25,6 @@ from fixtures.remote_storage import (
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     query_scalar,
-    start_in_background,
     subprocess_capture,
     wait_until,
 )
@@ -40,58 +34,6 @@ def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
     assert abs(a - b) / a < margin_ratio, abs(a - b) / a
 
 
-@contextmanager
-def new_pageserver_service(
-    new_pageserver_dir: Path,
-    pageserver_bin: Path,
-    remote_storage_mock_path: Path,
-    pg_port: int,
-    http_port: int,
-    broker: Optional[NeonBroker],
-    pg_distrib_dir: Path,
-):
-    """
-    cannot use NeonPageserver yet because it depends on neon cli
-    which currently lacks support for multiple pageservers
-    """
-    # actually run new pageserver
-    cmd = [
-        str(pageserver_bin),
-        "--workdir",
-        str(new_pageserver_dir),
-        "--update-config",
-        f"-c listen_pg_addr='localhost:{pg_port}'",
-        f"-c listen_http_addr='localhost:{http_port}'",
-        f"-c pg_distrib_dir='{pg_distrib_dir}'",
-        "-c id=2",
-        f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}",
-    ]
-    if broker is not None:
-        cmd.append(
-            f"-c broker_endpoint='{broker.client_url()}'",
-        )
-    pageserver_client = PageserverHttpClient(
-        port=http_port,
-        auth_token=None,
-        is_testing_enabled_or_skip=lambda: True,  # TODO: check if testing really enabled
-    )
-    try:
-        pageserver_process = start_in_background(
-            cmd, new_pageserver_dir, "pageserver.log", pageserver_client.check_status
-        )
-    except Exception as e:
-        log.error(e)
-        pageserver_process.kill()
-        raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") from e
-
-    log.info("new pageserver started")
-    try:
-        yield pageserver_process
-    finally:
-        log.info("stopping new pageserver")
-        pageserver_process.kill()
-
-
 @contextmanager
 def pg_cur(endpoint):
     with closing(endpoint.connect()) as conn:
@@ -201,7 +143,7 @@ def check_timeline_attached(
 
 
 def switch_pg_to_new_pageserver(
-    env: NeonEnv,
+    origin_ps: NeonPageserver,
     endpoint: Endpoint,
     new_pageserver_port: int,
     tenant_id: TenantId,
@@ -216,7 +158,7 @@ def switch_pg_to_new_pageserver(
 
     endpoint.start()
 
-    timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
+    timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id)
     files_before_detach = os.listdir(timeline_to_detach_local_path)
     assert (
         "metadata" in files_before_detach
@@ -269,27 +211,32 @@ def test_tenant_relocation(
     with_load: str,
 ):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    neon_env_builder.num_pageservers = 2
 
     env = neon_env_builder.init_start()
 
     tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209")
 
     # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
+    env.pageservers[0].allowed_errors.append(
         ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
     )
 
-    # Needed for detach polling.
-    env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
+    # Needed for detach polling on the original pageserver
+    env.pageservers[0].allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
+    # We will dual-attach in this test, so stale generations are expected
+    env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates.*")
 
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
-    remote_storage_mock_path = env.pageserver_remote_storage.root
 
     # we use two branches to check that they are both relocated
     # first branch is used for load, compute for second one is used to
     # check that data is not lost
 
-    pageserver_http = env.pageserver.http_client()
+    origin_ps = env.pageservers[0]
+    destination_ps = env.pageservers[1]
+    origin_http = origin_ps.http_client()
+    destination_http = destination_ps.http_client()
 
     _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id)
     log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id)
@@ -302,7 +249,7 @@ def test_tenant_relocation(
     timeline_id_main, current_lsn_main = populate_branch(
         ep_main,
         tenant_id=tenant_id,
-        ps_http=pageserver_http,
+        ps_http=origin_http,
         create_table=True,
         expected_sum=500500,
     )
@@ -320,17 +267,17 @@ def test_tenant_relocation(
     timeline_id_second, current_lsn_second = populate_branch(
         ep_second,
         tenant_id=tenant_id,
-        ps_http=pageserver_http,
+        ps_http=origin_http,
         create_table=False,
         expected_sum=1001000,
     )
 
     # wait until pageserver receives that data
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main)
-    timeline_detail_main = pageserver_http.timeline_detail(tenant_id, timeline_id_main)
+    wait_for_last_record_lsn(origin_http, tenant_id, timeline_id_main, current_lsn_main)
+    timeline_detail_main = origin_http.timeline_detail(tenant_id, timeline_id_main)
 
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second)
-    timeline_detail_second = pageserver_http.timeline_detail(tenant_id, timeline_id_second)
+    wait_for_last_record_lsn(origin_http, tenant_id, timeline_id_second, current_lsn_second)
+    timeline_detail_second = origin_http.timeline_detail(tenant_id, timeline_id_second)
 
     if with_load == "with_load":
         # create load table
@@ -350,170 +297,149 @@ def test_tenant_relocation(
     # if user creates a branch during migration
     # it wont appear on the new pageserver
     ensure_checkpoint(
-        pageserver_http=pageserver_http,
+        pageserver_http=origin_http,
         tenant_id=tenant_id,
         timeline_id=timeline_id_main,
         current_lsn=current_lsn_main,
     )
 
     ensure_checkpoint(
-        pageserver_http=pageserver_http,
+        pageserver_http=origin_http,
         tenant_id=tenant_id,
         timeline_id=timeline_id_second,
         current_lsn=current_lsn_second,
     )
 
-    log.info("inititalizing new pageserver")
-    # bootstrap second pageserver
-    new_pageserver_dir = env.repo_dir / "new_pageserver"
-    new_pageserver_dir.mkdir()
+    # Migrate either by attaching from s3 or import/export basebackup
+    if method == "major":
+        cmd = [
+            "poetry",
+            "run",
+            "python",
+            str(base_dir / "scripts/export_import_between_pageservers.py"),
+            "--tenant-id",
+            str(tenant_id),
+            "--from-host",
+            "localhost",
+            "--from-http-port",
+            str(origin_http.port),
+            "--from-pg-port",
+            str(origin_ps.service_port.pg),
+            "--to-host",
+            "localhost",
+            "--to-http-port",
+            str(destination_http.port),
+            "--to-pg-port",
+            str(destination_ps.service_port.pg),
+            "--pg-distrib-dir",
+            str(neon_env_builder.pg_distrib_dir),
+            "--work-dir",
+            str(test_output_dir),
+            "--tmp-pg-port",
+            str(port_distributor.get_port()),
+        ]
+        subprocess_capture(test_output_dir, cmd, check=True)
 
-    new_pageserver_pg_port = port_distributor.get_port()
-    new_pageserver_http_port = port_distributor.get_port()
-    log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port)
-    pageserver_bin = neon_binpath / "pageserver"
+        destination_ps.allowed_errors.append(
+            ".*ignored .* unexpected bytes after the tar archive.*"
+        )
+    elif method == "minor":
+        # call to attach timeline to new pageserver
+        destination_ps.tenant_attach(tenant_id)
 
-    new_pageserver_http = PageserverHttpClient(
-        port=new_pageserver_http_port,
-        auth_token=None,
-        is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip,
-    )
+        # wait for tenant to finish attaching
+        wait_until(
+            number_of_iterations=10,
+            interval=1,
+            func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"),
+        )
 
-    with new_pageserver_service(
-        new_pageserver_dir,
-        pageserver_bin,
-        remote_storage_mock_path,
-        new_pageserver_pg_port,
-        new_pageserver_http_port,
-        neon_env_builder.broker,
-        neon_env_builder.pg_distrib_dir,
-    ):
-        # Migrate either by attaching from s3 or import/export basebackup
-        if method == "major":
-            cmd = [
-                "poetry",
-                "run",
-                "python",
-                str(base_dir / "scripts/export_import_between_pageservers.py"),
-                "--tenant-id",
-                str(tenant_id),
-                "--from-host",
-                "localhost",
-                "--from-http-port",
-                str(pageserver_http.port),
-                "--from-pg-port",
-                str(env.pageserver.service_port.pg),
-                "--to-host",
-                "localhost",
-                "--to-http-port",
-                str(new_pageserver_http_port),
-                "--to-pg-port",
-                str(new_pageserver_pg_port),
-                "--pg-distrib-dir",
-                str(neon_env_builder.pg_distrib_dir),
-                "--work-dir",
-                str(test_output_dir),
-                "--tmp-pg-port",
-                str(port_distributor.get_port()),
-            ]
-            subprocess_capture(test_output_dir, cmd, check=True)
-        elif method == "minor":
-            # call to attach timeline to new pageserver
-            new_pageserver_http.tenant_attach(tenant_id)
-
-            # wait for tenant to finish attaching
-            wait_until(
-                number_of_iterations=10,
-                interval=1,
-                func=lambda: assert_tenant_state(new_pageserver_http, tenant_id, "Active"),
-            )
-
-            check_timeline_attached(
-                new_pageserver_http,
-                tenant_id,
-                timeline_id_main,
-                timeline_detail_main,
-                current_lsn_main,
-            )
-
-            check_timeline_attached(
-                new_pageserver_http,
-                tenant_id,
-                timeline_id_second,
-                timeline_detail_second,
-                current_lsn_second,
-            )
-
-        # rewrite neon cli config to use new pageserver for basebackup to start new compute
-        lines = (env.repo_dir / "config").read_text().splitlines()
-        for i, line in enumerate(lines):
-            if line.startswith("listen_http_addr"):
-                lines[i] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'"
-            if line.startswith("listen_pg_addr"):
-                lines[i] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'"
-        (env.repo_dir / "config").write_text("\n".join(lines))
-
-        old_local_path_main = switch_pg_to_new_pageserver(
-            env,
-            ep_main,
-            new_pageserver_pg_port,
+        check_timeline_attached(
+            destination_http,
             tenant_id,
             timeline_id_main,
+            timeline_detail_main,
+            current_lsn_main,
         )
 
-        old_local_path_second = switch_pg_to_new_pageserver(
-            env,
-            ep_second,
-            new_pageserver_pg_port,
+        check_timeline_attached(
+            destination_http,
             tenant_id,
             timeline_id_second,
+            timeline_detail_second,
+            current_lsn_second,
         )
 
-        # detach tenant from old pageserver before we check
-        # that all the data is there to be sure that old pageserver
-        # is no longer involved, and if it is, we will see the error
-        pageserver_http.tenant_detach(tenant_id)
+    # rewrite neon cli config to use new pageserver for basebackup to start new compute
+    lines = (env.repo_dir / "config").read_text().splitlines()
+    for i, line in enumerate(lines):
+        if line.startswith("listen_http_addr"):
+            lines[i] = f"listen_http_addr = 'localhost:{destination_http.port}'"
+        if line.startswith("listen_pg_addr"):
+            lines[i] = f"listen_pg_addr = 'localhost:{destination_ps.service_port.pg}'"
+    (env.repo_dir / "config").write_text("\n".join(lines))
 
-        # Wait a little, so that the detach operation has time to finish.
-        wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1)
+    old_local_path_main = switch_pg_to_new_pageserver(
+        origin_ps,
+        ep_main,
+        destination_ps.service_port.pg,
+        tenant_id,
+        timeline_id_main,
+    )
 
-        post_migration_check(ep_main, 500500, old_local_path_main)
-        post_migration_check(ep_second, 1001000, old_local_path_second)
+    old_local_path_second = switch_pg_to_new_pageserver(
+        origin_ps,
+        ep_second,
+        destination_ps.service_port.pg,
+        tenant_id,
+        timeline_id_second,
+    )
 
-        # ensure that we can successfully read all relations on the new pageserver
-        with pg_cur(ep_second) as cur:
-            cur.execute(
-                """
-                DO $$
-                DECLARE
-                r RECORD;
-                BEGIN
-                FOR r IN
-                SELECT relname FROM pg_class WHERE relkind='r'
-                LOOP
-                    RAISE NOTICE '%', r.relname;
-                    EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
-                END LOOP;
-                END$$;
-                """
-            )
+    # detach tenant from old pageserver before we check
+    # that all the data is there to be sure that old pageserver
+    # is no longer involved, and if it is, we will see the error
+    origin_http.tenant_detach(tenant_id)
 
-        if with_load == "with_load":
-            assert load_ok_event.wait(3)
-            log.info("stopping load thread")
-            load_stop_event.set()
-            load_thread.join(timeout=10)
-            log.info("load thread stopped")
+    # Wait a little, so that the detach operation has time to finish.
+    wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1)
 
-        # bring old pageserver back for clean shutdown via neon cli
-        # new pageserver will be shut down by the context manager
-        lines = (env.repo_dir / "config").read_text().splitlines()
-        for i, line in enumerate(lines):
-            if line.startswith("listen_http_addr"):
-                lines[i] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'"
-            if line.startswith("listen_pg_addr"):
-                lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
-        (env.repo_dir / "config").write_text("\n".join(lines))
+    post_migration_check(ep_main, 500500, old_local_path_main)
+    post_migration_check(ep_second, 1001000, old_local_path_second)
+
+    # ensure that we can successfully read all relations on the new pageserver
+    with pg_cur(ep_second) as cur:
+        cur.execute(
+            """
+            DO $$
+            DECLARE
+            r RECORD;
+            BEGIN
+            FOR r IN
+            SELECT relname FROM pg_class WHERE relkind='r'
+            LOOP
+                RAISE NOTICE '%', r.relname;
+                EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname;
+            END LOOP;
+            END$$;
+            """
+        )
+
+    if with_load == "with_load":
+        assert load_ok_event.wait(3)
+        log.info("stopping load thread")
+        load_stop_event.set()
+        load_thread.join(timeout=10)
+        log.info("load thread stopped")
+
+    # bring old pageserver back for clean shutdown via neon cli
+    # new pageserver will be shut down by the context manager
+    lines = (env.repo_dir / "config").read_text().splitlines()
+    for i, line in enumerate(lines):
+        if line.startswith("listen_http_addr"):
+            lines[i] = f"listen_http_addr = 'localhost:{origin_ps.service_port.http}'"
+        if line.startswith("listen_pg_addr"):
+            lines[i] = f"listen_pg_addr = 'localhost:{origin_ps.service_port.pg}'"
+    (env.repo_dir / "config").write_text("\n".join(lines))
 
 
 # Simulate hard crash of pageserver and re-attach a tenant with a branch
@@ -571,7 +497,7 @@ def test_emergency_relocate_with_branches_slow_replay(
     # Attach and wait a few seconds to give it time to load the tenants, attach to the
     # safekeepers, and to stream and ingest the WAL up to the pause-point.
     before_attach_time = time.time()
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
     time.sleep(3)
 
     # The wal ingestion on the main timeline should now be paused at the fail point.
@@ -718,7 +644,7 @@ def test_emergency_relocate_with_branches_createdb(
     # ingest the WAL, but let's make this less dependent on accidental timing.
     pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
     before_attach_time = time.time()
-    pageserver_http.tenant_attach(tenant_id)
+    env.pageserver.tenant_attach(tenant_id)
 
     child_endpoint.start()
     with child_endpoint.cursor(dbname="neondb") as cur:
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 0169335a70..f181e70696 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -297,8 +297,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
     assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"
 
     # the remote side of local_layer_truncated
-    remote_layer_path = (
-        env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / path.name
+    remote_layer_path = env.pageserver_remote_storage.remote_layer_path(
+        tenant_id, timeline_id, path.name
     )
 
     # if the upload ever was ongoing, this check would be racy, but at least one
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index b1a2755394..dab7b3879e 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -396,7 +396,7 @@ def test_timeline_resurrection_on_attach(
     ##### Second start, restore the data and ensure that we see only timeline that wasnt deleted
     env.pageserver.start()
 
-    ps_http.tenant_attach(tenant_id=tenant_id)
+    env.pageserver.tenant_attach(tenant_id=tenant_id)
 
     wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5)
 
@@ -897,7 +897,7 @@ def test_timeline_delete_resumed_on_attach(
     env.pageserver.start()
 
     # now we call attach
-    ps_http.tenant_attach(tenant_id=tenant_id)
+    env.pageserver.tenant_attach(tenant_id=tenant_id)
 
     # delete should be resumed
     wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations)
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index ad12b56874..b34d2de0ba 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -298,17 +298,21 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
 
     # and wait till remote_consistent_lsn propagates to all safekeepers
     #
-    # TODO: this executes long as timeline on safekeeper is immediately
-    # deactivated once rcl reaches pageserver one, and thus we generally wait
-    # till pageserver reconnects to all safekeepers one by one here. Timeline
-    # status on safekeeper should take into account peers state as well.
+    # This timeout is long: safekeepers learn about remote_consistent_lsn updates when a pageserver
+    # connects, receives a PrimaryKeepAlive, and sends a PageserverFeedback.  So the timeout has to encompass:
+    # - pageserver deletion_queue to validate + publish the remote_consistent_lsn
+    # - pageserver to reconnect to all safekeepers one by one, with multi-second delays between
+    #
+    # TODO: timeline status on safekeeper should take into account peers state as well.
+    rcl_propagate_secs = 60
+
     started_at = time.time()
     while True:
         stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
         if all([s_after.remote_consistent_lsn >= new_rcl for s_after in stat_after]):
             break
         elapsed = time.time() - started_at
-        if elapsed > 30:
+        if elapsed > rcl_propagate_secs:
             raise RuntimeError(
                 f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
             )

From 880663f6bc308fbbb0ae61b8aa03ed664448736c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 7 Dec 2023 14:52:16 +0000
Subject: [PATCH 045/202] tests: use tenant_create() helper in test_bulk_insert
 (#6064)

## Problem

Since #5449 we enable generations in tests by default. The benchmarks
were not run when merging that PR, and one of them needed updating.

## Summary of changes

Make test_bulk_insert use the proper generation-aware helper for tenant
creation.
---
 test_runner/performance/test_bulk_insert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index 46acec0f63..a146e011cc 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -57,7 +57,7 @@ def measure_recovery_time(env: NeonCompare):
     # when we "create" the Tenant again, we will replay the WAL from the beginning.
     client.tenant_delete(env.tenant)
     wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
-    client.tenant_create(new_tenant_id=env.tenant)
+    env.env.pageserver.tenant_create(tenant_id=env.tenant)
 
     # Measure recovery time
     with env.record_duration("wal_recovery"):

From b492cedf514b87de72c9e543cd87a68f3ddf9f66 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 7 Dec 2023 17:52:22 +0200
Subject: [PATCH 046/202] fix(remote_storage): buffering, by using streams for
 upload and download (#5446)

There is double buffering, in 8KiB buffers, in remote_storage and in pageserver
when using `tokio::io::copy` to read `BufReader<ReaderStream<_>>`.

Switches downloads and uploads to use `Stream<Item =
std::io::Result<Bytes>>`. The caller, and only the caller, now handles
setting up buffering. For reading, `Stream<Item = ...>` is also an
`AsyncBufRead`, so when writing to a file, we now have `tokio::io::copy_buf`
reading full buffers and writing them to `tokio::io::BufWriter` which handles
the buffering before dispatching over to `tokio::fs::File`.

Additionally implements streaming uploads for azure. With azure, downloads
are a bit nicer than before, but not much; instead of one huge vec they
just hold on to the N allocations we got over the wire.

This PR will also make it trivial to switch reading and writing to
io-uring based methods.

Cc: #5563.
---
 Cargo.lock                                    |   3 +
 libs/remote_storage/Cargo.toml                |   3 +-
 libs/remote_storage/src/azure_blob.rs         | 183 ++++++++++++++++--
 libs/remote_storage/src/lib.rs                |  12 +-
 libs/remote_storage/src/local_fs.rs           | 120 ++++++------
 libs/remote_storage/src/s3_bucket.rs          | 112 +++++++----
 libs/remote_storage/src/simulate_failures.rs  |   4 +-
 libs/remote_storage/tests/test_real_azure.rs  |  99 ++++++----
 libs/remote_storage/tests/test_real_s3.rs     |  66 ++++---
 pageserver/src/tenant/delete.rs               |   4 +-
 .../tenant/remote_timeline_client/download.rs |  42 ++--
 .../tenant/remote_timeline_client/upload.rs   |  15 +-
 safekeeper/Cargo.toml                         |   1 +
 safekeeper/src/wal_backup.rs                  |  20 +-
 workspace_hack/Cargo.toml                     |   2 +-
 15 files changed, 468 insertions(+), 218 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f6630cc203..fcdc424636 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3864,6 +3864,7 @@ dependencies = [
  "bytes",
  "camino",
  "camino-tempfile",
+ "futures",
  "futures-util",
  "http-types",
  "hyper",
@@ -4291,6 +4292,7 @@ dependencies = [
  "tokio-io-timeout",
  "tokio-postgres",
  "tokio-stream",
+ "tokio-util",
  "toml_edit",
  "tracing",
  "url",
@@ -5220,6 +5222,7 @@ checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
 dependencies = [
  "bytes",
  "futures-core",
+ "futures-io",
  "futures-sink",
  "pin-project-lite",
  "tokio",
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index e8bfc005d3..2cc59a947b 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -16,10 +16,11 @@ aws-credential-types.workspace = true
 bytes.workspace = true
 camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
+futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
-tokio-util.workspace = true
+tokio-util = { workspace = true, features = ["compat"] }
 toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index ae08e9b171..e559d00ded 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,21 +1,24 @@
 //! Azure Blob Storage wrapper
 
+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
+use std::pin::Pin;
 use std::sync::Arc;
-use std::{borrow::Cow, io::Cursor};
 
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use bytes::Bytes;
+use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::StatusCode;
-use tokio::io::AsyncRead;
 use tracing::debug;
 
 use crate::s3_bucket::RequestKind;
@@ -49,7 +52,8 @@ impl AzureBlobStorage {
             StorageCredentials::token_credential(Arc::new(token_credential))
         };
 
-        let builder = ClientBuilder::new(account, credentials);
+        // we have an outer retry
+        let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
 
         let client = builder.container_client(azure_config.container_name.to_owned());
 
@@ -116,7 +120,8 @@ impl AzureBlobStorage {
         let mut metadata = HashMap::new();
         // TODO give proper streaming response instead of buffering into RAM
         // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::new();
+
+        let mut bufs = Vec::new();
         while let Some(part) = response.next().await {
             let part = part.map_err(to_download_error)?;
             if let Some(blob_meta) = part.blob.metadata {
@@ -127,10 +132,10 @@ impl AzureBlobStorage {
                 .collect()
                 .await
                 .map_err(|e| DownloadError::Other(e.into()))?;
-            buf.extend_from_slice(&data.slice(..));
+            bufs.push(data);
         }
         Ok(Download {
-            download_stream: Box::pin(Cursor::new(buf)),
+            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
             metadata: Some(StorageMetadata(metadata)),
         })
     }
@@ -217,9 +222,10 @@ impl RemoteStorage for AzureBlobStorage {
         }
         Ok(res)
     }
+
     async fn upload(
         &self,
-        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
@@ -227,13 +233,12 @@ impl RemoteStorage for AzureBlobStorage {
         let _permit = self.permit(RequestKind::Put).await;
         let blob_client = self.client.blob_client(self.relative_path_to_name(to));
 
-        // TODO FIX THIS UGLY HACK and don't buffer the entire object
-        // into RAM here, but use the streaming interface. For that,
-        // we'd have to change the interface though...
-        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::with_capacity(data_size_bytes);
-        tokio::io::copy(&mut from, &mut buf).await?;
-        let body = azure_core::Body::Bytes(buf.into());
+        let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+            Box::pin(from);
+
+        let from = NonSeekableStream::new(from, data_size_bytes);
+
+        let body = azure_core::Body::SeekableStream(Box::new(from));
 
         let mut builder = blob_client.put_block_blob(body);
 
@@ -312,3 +317,153 @@ impl RemoteStorage for AzureBlobStorage {
         Ok(())
     }
 }
+
+pin_project_lite::pin_project! {
+    /// Hack to work around not being able to stream once with azure sdk.
+    ///
+    /// Azure sdk clones streams around with the assumption that they are like
+    /// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
+    /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
+    /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
+    /// seekable, but we can also just re-try the request easier.
+    #[project = NonSeekableStreamProj]
+    enum NonSeekableStream<S> {
+        /// A stream wrappers initial form.
+        ///
+        /// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
+        /// clone before first request, then this must be changed.
+        Initial {
+            inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
+            len: usize,
+        },
+        /// The actually readable variant, produced by cloning the Initial variant.
+        ///
+        /// The sdk currently always clones once, even without retry policy.
+        Actual {
+            #[pin]
+            inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
+            len: usize,
+            read_any: bool,
+        },
+        /// Most likely unneeded, but left to make life easier, in case more clones are added.
+        Cloned {
+            len_was: usize,
+        }
+    }
+}
+
+impl<S> NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+{
+    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
+        use tokio_util::compat::TokioAsyncReadCompatExt;
+
+        let inner = tokio_util::io::StreamReader::new(inner).compat();
+        let inner = Some(inner);
+        let inner = std::sync::Mutex::new(inner);
+        NonSeekableStream::Initial { inner, len }
+    }
+}
+
+impl<S> std::fmt::Debug for NonSeekableStream<S> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
+            Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
+            Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
+        }
+    }
+}
+
+impl<S> futures::io::AsyncRead for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>>,
+{
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut [u8],
+    ) -> std::task::Poll<std::io::Result<usize>> {
+        match self.project() {
+            NonSeekableStreamProj::Actual {
+                inner, read_any, ..
+            } => {
+                *read_any = true;
+                inner.poll_read(cx, buf)
+            }
+            // NonSeekableStream::Initial does not support reading because it is just much easier
+            // to have the mutex in place where one does not poll the contents, or that's how it
+            // seemed originally. If there is a version upgrade which changes the cloning, then
+            // that support needs to be hacked in.
+            //
+            // including {self:?} into the message would be useful, but unsure how to unproject.
+            _ => std::task::Poll::Ready(Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "cloned or initial values cannot be read",
+            ))),
+        }
+    }
+}
+
+impl<S> Clone for NonSeekableStream<S> {
+    /// Weird clone implementation exists to support the sdk doing cloning before issuing the first
+    /// request, see type documentation.
+    fn clone(&self) -> Self {
+        use NonSeekableStream::*;
+
+        match self {
+            Initial { inner, len } => {
+                if let Some(inner) = inner.lock().unwrap().take() {
+                    Actual {
+                        inner,
+                        len: *len,
+                        read_any: false,
+                    }
+                } else {
+                    Self::Cloned { len_was: *len }
+                }
+            }
+            Actual { len, .. } => Cloned { len_was: *len },
+            Cloned { len_was } => Cloned { len_was: *len_was },
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl<S> azure_core::SeekableStream for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
+{
+    async fn reset(&mut self) -> azure_core::error::Result<()> {
+        use NonSeekableStream::*;
+
+        let msg = match self {
+            Initial { inner, .. } => {
+                if inner.get_mut().unwrap().is_some() {
+                    return Ok(());
+                } else {
+                    "reset after first clone is not supported"
+                }
+            }
+            Actual { read_any, .. } if !*read_any => return Ok(()),
+            Actual { .. } => "reset after reading is not supported",
+            Cloned { .. } => "reset after second clone is not supported",
+        };
+        Err(azure_core::error::Error::new(
+            azure_core::error::ErrorKind::Io,
+            std::io::Error::new(std::io::ErrorKind::Other, msg),
+        ))
+    }
+
+    // Note: it is not documented if this should be the total or remaining length, total passes the
+    // tests.
+    fn len(&self) -> usize {
+        use NonSeekableStream::*;
+        match self {
+            Initial { len, .. } => *len,
+            Actual { len, .. } => *len,
+            Cloned { len_was, .. } => *len_was,
+        }
+    }
+}
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index e6d306ff66..e77c54e1e7 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -19,8 +19,10 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::A
 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
 
+use bytes::Bytes;
+use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
-use tokio::{io, sync::Semaphore};
+use tokio::sync::Semaphore;
 use toml_edit::Item;
 use tracing::info;
 
@@ -179,7 +181,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
     /// Streams the local file contents into remote into the remote storage entry.
     async fn upload(
         &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         // S3 PUT request requires the content length to be specified,
         // otherwise it starts to fail with the concurrent connection count increasing.
         data_size_bytes: usize,
@@ -206,7 +208,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
 }
 
 pub struct Download {
-    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
+    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
     /// Extra key-value data, associated with the current remote file.
     pub metadata: Option<StorageMetadata>,
 }
@@ -300,7 +302,7 @@ impl GenericRemoteStorage {
 
     pub async fn upload(
         &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
@@ -398,7 +400,7 @@ impl GenericRemoteStorage {
     /// this path is used for the remote object id conversion only.
     pub async fn upload_storage_object(
         &self,
-        from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
     ) -> anyhow::Result<()> {
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index fccc78de20..0016c21955 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -7,11 +7,14 @@
 use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
 
 use anyhow::{bail, ensure, Context};
+use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
+use futures::stream::Stream;
 use tokio::{
     fs,
     io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
+use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 
@@ -219,7 +222,7 @@ impl RemoteStorage for LocalFs {
 
     async fn upload(
         &self,
-        data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
@@ -252,8 +255,11 @@ impl RemoteStorage for LocalFs {
         );
 
         let from_size_bytes = data_size_bytes as u64;
+        let data = tokio_util::io::StreamReader::new(data);
+        let data = std::pin::pin!(data);
         let mut buffer_to_read = data.take(from_size_bytes);
 
+        // alternatively we could just write the bytes to a file, but local_fs is a testing utility
         let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
             .await
             .with_context(|| {
@@ -308,7 +314,7 @@ impl RemoteStorage for LocalFs {
     async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
         let target_path = from.with_base(&self.storage_root);
         if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let source = io::BufReader::new(
+            let source = ReaderStream::new(
                 fs::OpenOptions::new()
                     .read(true)
                     .open(&target_path)
@@ -348,16 +354,14 @@ impl RemoteStorage for LocalFs {
         }
         let target_path = from.with_base(&self.storage_root);
         if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let mut source = io::BufReader::new(
-                fs::OpenOptions::new()
-                    .read(true)
-                    .open(&target_path)
-                    .await
-                    .with_context(|| {
-                        format!("Failed to open source file {target_path:?} to use in the download")
-                    })
-                    .map_err(DownloadError::Other)?,
-            );
+            let mut source = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&target_path)
+                .await
+                .with_context(|| {
+                    format!("Failed to open source file {target_path:?} to use in the download")
+                })
+                .map_err(DownloadError::Other)?;
             source
                 .seek(io::SeekFrom::Start(start_inclusive))
                 .await
@@ -371,11 +375,13 @@ impl RemoteStorage for LocalFs {
             Ok(match end_exclusive {
                 Some(end_exclusive) => Download {
                     metadata,
-                    download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                    download_stream: Box::pin(ReaderStream::new(
+                        source.take(end_exclusive - start_inclusive),
+                    )),
                 },
                 None => Download {
                     metadata,
-                    download_stream: Box::pin(source),
+                    download_stream: Box::pin(ReaderStream::new(source)),
                 },
             })
         } else {
@@ -475,7 +481,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
 mod fs_tests {
     use super::*;
 
+    use bytes::Bytes;
     use camino_tempfile::tempdir;
+    use futures_util::Stream;
     use std::{collections::HashMap, io::Write};
 
     async fn read_and_assert_remote_file_contents(
@@ -485,7 +493,7 @@ mod fs_tests {
         remote_storage_path: &RemotePath,
         expected_metadata: Option<&StorageMetadata>,
     ) -> anyhow::Result<String> {
-        let mut download = storage
+        let download = storage
             .download(remote_storage_path)
             .await
             .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -494,13 +502,9 @@ mod fs_tests {
             "Unexpected metadata returned for the downloaded file"
         );
 
-        let mut contents = String::new();
-        download
-            .download_stream
-            .read_to_string(&mut contents)
-            .await
-            .context("Failed to read remote file contents into string")?;
-        Ok(contents)
+        let contents = aggregate(download.download_stream).await?;
+
+        String::from_utf8(contents).map_err(anyhow::Error::new)
     }
 
     #[tokio::test]
@@ -529,25 +533,26 @@ mod fs_tests {
         let storage = create_storage()?;
 
         let id = RemotePath::new(Utf8Path::new("dummy"))?;
-        let content = std::io::Cursor::new(b"12345");
+        let content = Bytes::from_static(b"12345");
+        let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));
 
         // Check that you get an error if the size parameter doesn't match the actual
         // size of the stream.
         storage
-            .upload(Box::new(content.clone()), 0, &id, None)
+            .upload(content(), 0, &id, None)
             .await
             .expect_err("upload with zero size succeeded");
         storage
-            .upload(Box::new(content.clone()), 4, &id, None)
+            .upload(content(), 4, &id, None)
             .await
             .expect_err("upload with too short size succeeded");
         storage
-            .upload(Box::new(content.clone()), 6, &id, None)
+            .upload(content(), 6, &id, None)
             .await
             .expect_err("upload with too large size succeeded");
 
         // Correct size is 5, this should succeed.
-        storage.upload(Box::new(content), 5, &id, None).await?;
+        storage.upload(content(), 5, &id, None).await?;
 
         Ok(())
     }
@@ -595,7 +600,7 @@ mod fs_tests {
         let uploaded_bytes = dummy_contents(upload_name).into_bytes();
         let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
 
-        let mut first_part_download = storage
+        let first_part_download = storage
             .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
             .await?;
         assert!(
@@ -603,21 +608,13 @@ mod fs_tests {
             "No metadata should be returned for no metadata upload"
         );
 
-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut first_part_download.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(first_part_download.download_stream).await?;
         assert_eq!(
-            first_part_local,
-            first_part_remote.as_slice(),
+            first_part_local, first_part_remote,
             "First part bytes should be returned when requested"
         );
 
-        let mut second_part_download = storage
+        let second_part_download = storage
             .download_byte_range(
                 &upload_target,
                 first_part_local.len() as u64,
@@ -629,17 +626,9 @@ mod fs_tests {
             "No metadata should be returned for no metadata upload"
         );
 
-        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut second_part_download.download_stream,
-            &mut second_part_remote,
-        )
-        .await?;
-        second_part_remote.flush().await?;
-        let second_part_remote = second_part_remote.into_inner().into_inner();
+        let second_part_remote = aggregate(second_part_download.download_stream).await?;
         assert_eq!(
-            second_part_local,
-            second_part_remote.as_slice(),
+            second_part_local, second_part_remote,
             "Second part bytes should be returned when requested"
         );
 
@@ -729,17 +718,10 @@ mod fs_tests {
         let uploaded_bytes = dummy_contents(upload_name).into_bytes();
         let (first_part_local, _) = uploaded_bytes.split_at(3);
 
-        let mut partial_download_with_metadata = storage
+        let partial_download_with_metadata = storage
             .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
             .await?;
-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut partial_download_with_metadata.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
         assert_eq!(
             first_part_local,
             first_part_remote.as_slice(),
@@ -815,16 +797,16 @@ mod fs_tests {
                 )
             })?;
 
-        storage
-            .upload(Box::new(file), size, &relative_path, metadata)
-            .await?;
+        let file = tokio_util::io::ReaderStream::new(file);
+
+        storage.upload(file, size, &relative_path, metadata).await?;
         Ok(relative_path)
     }
 
     async fn create_file_for_upload(
         path: &Utf8Path,
         contents: &str,
-    ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
+    ) -> anyhow::Result<(fs::File, usize)> {
         std::fs::create_dir_all(path.parent().unwrap())?;
         let mut file_for_writing = std::fs::OpenOptions::new()
             .write(true)
@@ -834,7 +816,7 @@ mod fs_tests {
         drop(file_for_writing);
         let file_size = path.metadata()?.len() as usize;
         Ok((
-            io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
+            fs::OpenOptions::new().read(true).open(&path).await?,
             file_size,
         ))
     }
@@ -848,4 +830,16 @@ mod fs_tests {
         files.sort_by(|a, b| a.0.cmp(&b.0));
         Ok(files)
     }
+
+    async fn aggregate(
+        stream: impl Stream<Item = std::io::Result<Bytes>>,
+    ) -> anyhow::Result<Vec<u8>> {
+        use futures::stream::StreamExt;
+        let mut out = Vec::new();
+        let mut stream = std::pin::pin!(stream);
+        while let Some(res) = stream.next().await {
+            out.extend_from_slice(&res?[..]);
+        }
+        Ok(out)
+    }
 }
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 3016a14ec9..97fa1bbf5b 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,9 +4,14 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.
 
-use std::{borrow::Cow, sync::Arc};
+use std::{
+    borrow::Cow,
+    pin::Pin,
+    sync::Arc,
+    task::{Context, Poll},
+};
 
-use anyhow::Context;
+use anyhow::Context as _;
 use aws_config::{
     environment::credentials::EnvironmentVariableCredentialsProvider,
     imds::credentials::ImdsCredentialsProvider,
@@ -28,11 +33,10 @@ use aws_smithy_async::rt::sleep::TokioSleep;
 
 use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
+use bytes::Bytes;
+use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::io::{self, AsyncRead};
-use tokio_util::io::ReaderStream;
-use tracing::debug;
 
 use super::StorageMetadata;
 use crate::{
@@ -63,7 +67,7 @@ struct GetObjectRequest {
 impl S3Bucket {
     /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
     pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
-        debug!(
+        tracing::debug!(
             "Creating s3 remote storage for S3 bucket {}",
             aws_config.bucket_name
         );
@@ -225,12 +229,15 @@ impl S3Bucket {
         match get_object {
             Ok(object_output) => {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
+
+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
+
                 Ok(Download {
                     metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
-                    ))),
+                    download_stream: Box::pin(body),
                 })
             }
             Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -243,29 +250,55 @@ impl S3Bucket {
     }
 }
 
+pin_project_lite::pin_project! {
+    struct ByteStreamAsStream {
+        #[pin]
+        inner: aws_smithy_types::byte_stream::ByteStream
+    }
+}
+
+impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
+    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
+        ByteStreamAsStream { inner }
+    }
+}
+
+impl Stream for ByteStreamAsStream {
+    type Item = std::io::Result<Bytes>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // this does the std::io::ErrorKind::Other conversion
+        self.project().inner.poll_next(cx).map_err(|x| x.into())
+    }
+
+    // `size_hint` is deliberately not implemented: `inner.size_hint()` reports the remaining size
+    // in bytes, whereas `Stream::size_hint` is expected to count the remaining items (chunks).
+}
+
 pin_project_lite::pin_project! {
     /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct RatelimitedAsyncRead<S> {
+    struct PermitCarrying<S> {
         permit: tokio::sync::OwnedSemaphorePermit,
         #[pin]
         inner: S,
     }
 }
 
-impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+impl<S> PermitCarrying<S> {
     fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        RatelimitedAsyncRead { permit, inner }
+        Self { permit, inner }
     }
 }
 
-impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        this.inner.poll_read(cx, buf)
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }
 
@@ -285,7 +318,7 @@ pin_project_lite::pin_project! {
     }
 }
 
-impl<S: AsyncRead> TimedDownload<S> {
+impl<S> TimedDownload<S> {
     fn new(started_at: std::time::Instant, inner: S) -> Self {
         TimedDownload {
             started_at,
@@ -295,25 +328,26 @@ impl<S: AsyncRead> TimedDownload<S> {
     }
 }
 
-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        use std::task::ready;
+
         let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));
 
-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
+        let res = ready!(this.inner.poll_next(cx));
+        match &res {
+            Some(Ok(_)) => {}
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok,
         }
 
-        std::task::Poll::Ready(read)
+        Poll::Ready(res)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }
 
@@ -403,7 +437,7 @@ impl RemoteStorage for S3Bucket {
 
     async fn upload(
         &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
@@ -413,7 +447,7 @@ impl RemoteStorage for S3Bucket {
 
         let started_at = start_measuring_requests(kind);
 
-        let body = Body::wrap_stream(ReaderStream::new(from));
+        let body = Body::wrap_stream(from);
         let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
 
         let res = self
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index cd13db1923..802b0db7f5 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use bytes::Bytes;
+use futures::stream::Stream;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -108,7 +110,7 @@ impl RemoteStorage for UnreliableWrapper {
 
     async fn upload(
         &self,
-        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         // S3 PUT request requires the content length to be specified,
         // otherwise it starts to fail with the concurrent connection count increasing.
         data_size_bytes: usize,
diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs
index b631079bc5..7327803198 100644
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
     AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -180,23 +182,14 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
 
     ctx.client.delete_objects(&[path1, path2]).await?;
 
@@ -219,53 +212,56 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
     let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data = "remote blob data here".as_bytes();
-    let data_len = data.len() as u64;
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
 
-    ctx.client
-        .upload(std::io::Cursor::new(data), data.len(), &path, None)
-        .await?;
+    let (data, len) = wrap_stream(orig.clone());
 
-    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+    ctx.client.upload(data, len, &path, None).await?;
+
+    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
         let mut buf = Vec::new();
-        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
+        tokio::io::copy_buf(
+            &mut tokio_util::io::StreamReader::new(dl.download_stream),
+            &mut buf,
+        )
+        .await?;
         Ok(buf)
     }
     // Normal download request
     let dl = ctx.client.download(&path).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     // Full range (end specified)
     let dl = ctx
         .client
-        .download_byte_range(&path, 0, Some(data_len))
+        .download_byte_range(&path, 0, Some(len as u64))
         .await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     // partial range (end specified)
     let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..10]);
+    assert_eq!(&buf, &orig[4..10]);
 
     // partial range (end beyond real end)
     let dl = ctx
         .client
-        .download_byte_range(&path, 8, Some(data_len * 100))
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
         .await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[8..]);
+    assert_eq!(&buf, &orig[8..]);
 
     // Partial range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 4, None).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..]);
+    assert_eq!(&buf, &orig[4..]);
 
     // Full range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 0, None).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     debug!("Cleanup: deleting file at path {path:?}");
     ctx.client
@@ -504,11 +500,8 @@ async fn upload_azure_data(
             let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
@@ -589,11 +582,8 @@ async fn upload_simple_azure_data(
             .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>(blob_path)
         });
@@ -622,3 +612,32 @@ async fn upload_simple_azure_data(
         ControlFlow::Continue(uploaded_blobs)
     }
 }
+
+// FIXME: copy-pasted from test_real_s3; these helpers should live in a shared module that is not
+// compiled as its own test binary (e.g. `tests/common/mod.rs`).
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 48f00e0106..ecd834e61c 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -176,23 +178,14 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
 
     ctx.client.delete_objects(&[path1, path2]).await?;
 
@@ -432,11 +425,9 @@ async fn upload_s3_data(
             let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
@@ -517,11 +508,9 @@ async fn upload_simple_s3_data(
             .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>(blob_path)
         });
@@ -550,3 +539,30 @@ async fn upload_simple_s3_data(
         ControlFlow::Continue(uploaded_blobs)
     }
 }
+
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 548b173c0d..b8d6d0a321 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -77,8 +77,10 @@ async fn create_remote_delete_mark(
     let data: &[u8] = &[];
     backoff::retry(
         || async {
+            let data = bytes::Bytes::from_static(data);
+            let stream = futures::stream::once(futures::future::ready(Ok(data)));
             remote_storage
-                .upload(data, 0, &remote_mark_path, None)
+                .upload(stream, 0, &remote_mark_path, None)
                 .await
         },
         |_e| false,
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index deb5ea84a8..3356f55f34 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -75,12 +75,11 @@ pub async fn download_layer_file<'a>(
 
     let (mut destination_file, bytes_amount) = download_retry(
         || async {
-            // TODO: this doesn't use the cached fd for some reason?
-            let mut destination_file = fs::File::create(&temp_file_path)
+            let destination_file = tokio::fs::File::create(&temp_file_path)
                 .await
                 .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                 .map_err(DownloadError::Other)?;
-            let mut download = storage
+            let download = storage
                 .download(&remote_path)
                 .await
                 .with_context(|| {
@@ -90,9 +89,14 @@ pub async fn download_layer_file<'a>(
                 })
                 .map_err(DownloadError::Other)?;
 
+            let mut destination_file =
+                tokio::io::BufWriter::with_capacity(8 * 1024, destination_file);
+
+            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+
             let bytes_amount = tokio::time::timeout(
                 MAX_DOWNLOAD_DURATION,
-                tokio::io::copy(&mut download.download_stream, &mut destination_file),
+                tokio::io::copy_buf(&mut reader, &mut destination_file),
             )
             .await
             .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
@@ -103,6 +107,8 @@ pub async fn download_layer_file<'a>(
             })
             .map_err(DownloadError::Other)?;
 
+            let destination_file = destination_file.into_inner();
+
             Ok((destination_file, bytes_amount))
         },
         &format!("download {remote_path:?}"),
@@ -220,20 +226,22 @@ async fn do_download_index_part(
     index_generation: Generation,
     cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
+    use futures::stream::StreamExt;
+
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
     let index_part_bytes = download_retry_forever(
         || async {
-            let mut index_part_download = storage.download(&remote_path).await?;
+            let index_part_download = storage.download(&remote_path).await?;
 
             let mut index_part_bytes = Vec::new();
-            tokio::io::copy(
-                &mut index_part_download.download_stream,
-                &mut index_part_bytes,
-            )
-            .await
-            .with_context(|| format!("download index part at {remote_path:?}"))
-            .map_err(DownloadError::Other)?;
+            let mut stream = std::pin::pin!(index_part_download.download_stream);
+            while let Some(chunk) = stream.next().await {
+                let chunk = chunk
+                    .with_context(|| format!("download index part at {remote_path:?}"))
+                    .map_err(DownloadError::Other)?;
+                index_part_bytes.extend_from_slice(&chunk[..]);
+            }
             Ok(index_part_bytes)
         },
         &format!("download {remote_path:?}"),
@@ -398,7 +406,7 @@ pub(crate) async fn download_initdb_tar_zst(
 
     let file = download_retry(
         || async {
-            let mut file = OpenOptions::new()
+            let file = OpenOptions::new()
                 .create(true)
                 .truncate(true)
                 .read(true)
@@ -408,13 +416,17 @@ pub(crate) async fn download_initdb_tar_zst(
                 .with_context(|| format!("tempfile creation {temp_path}"))
                 .map_err(DownloadError::Other)?;
 
-            let mut download = storage.download(&remote_path).await?;
+            let download = storage.download(&remote_path).await?;
+            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
+            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
 
-            tokio::io::copy(&mut download.download_stream, &mut file)
+            tokio::io::copy_buf(&mut download, &mut writer)
                 .await
                 .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
                 .map_err(DownloadError::Other)?;
 
+            let mut file = writer.into_inner();
+
             file.seek(std::io::SeekFrom::Start(0))
                 .await
                 .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 4ca4438003..0ec539a64e 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -41,11 +41,15 @@ pub(super) async fn upload_index_part<'a>(
         .to_s3_bytes()
         .context("serialize index part file into bytes")?;
     let index_part_size = index_part_bytes.len();
-    let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
+    let index_part_bytes = bytes::Bytes::from(index_part_bytes);
 
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
     storage
-        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
+        .upload_storage_object(
+            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
+            index_part_size,
+            &remote_path,
+        )
         .await
         .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
@@ -101,8 +105,10 @@ pub(super) async fn upload_timeline_layer<'a>(
     let fs_size = usize::try_from(fs_size)
         .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
 
+    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, 8 * 1024);
+
     storage
-        .upload(source_file, fs_size, &storage_path, None)
+        .upload(reader, fs_size, &storage_path, None)
         .await
         .with_context(|| format!("upload layer from local path '{source_path}'"))?;
 
@@ -119,7 +125,8 @@ pub(crate) async fn upload_initdb_dir(
     tracing::trace!("uploading initdb dir");
 
     let size = initdb_dir.len();
-    let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));
+
+    let bytes = futures::stream::once(futures::future::ready(Ok(initdb_dir)));
 
     let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
     storage
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 53fcd5ff07..cccb4ebd79 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -35,6 +35,7 @@ serde_with.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["fs"] }
+tokio-util = { workspace = true }
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 toml_edit.workspace = true
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 22c68ce3c9..2e2cb11e3f 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -494,15 +494,13 @@ async fn backup_object(
         .as_ref()
         .unwrap();
 
-    let file = tokio::io::BufReader::new(
-        File::open(&source_file)
-            .await
-            .with_context(|| format!("Failed to open file {} for wal backup", source_file))?,
-    );
-
-    storage
-        .upload_storage_object(Box::new(file), size, target_file)
+    let file = File::open(&source_file)
         .await
+        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
+
+    let file = tokio_util::io::ReaderStream::with_capacity(file, 8 * 1024);
+
+    storage.upload_storage_object(file, size, target_file).await
 }
 
 pub async fn read_object(
@@ -524,5 +522,9 @@ pub async fn read_object(
             format!("Failed to open WAL segment download stream for remote path {file_path:?}")
         })?;
 
-    Ok(download.download_stream)
+    let reader = tokio_util::io::StreamReader::new(download.download_stream);
+
+    let reader = tokio::io::BufReader::with_capacity(8 * 1024, reader);
+
+    Ok(Box::pin(reader))
 }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 3e46731adf..82945dfacb 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -65,7 +65,7 @@ subtle = { version = "2" }
 time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
-tokio-util = { version = "0.7", features = ["codec", "io"] }
+tokio-util = { version = "0.7", features = ["codec", "compat", "io"] }
 toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
 toml_edit = { version = "0.19", features = ["serde"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }

From f2892d3798067e788d81df356aa93286afa1d436 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 7 Dec 2023 17:17:33 +0100
Subject: [PATCH 047/202] virtual_file metrics: distinguish first and
 subsequent open() syscalls (#6066)

This helps with identifying thrashing.

I don't love the name, but there is already "close-by-replace".

While reading the code, I also found a case where we waste
work in a cache pressure situation:
https://github.com/neondatabase/neon/issues/6065

refs https://github.com/neondatabase/cloud/issues/8351
---
 pageserver/src/metrics.rs      |  3 ++-
 pageserver/src/virtual_file.rs | 13 +++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index b684c4fa90..3554a93ed9 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -774,6 +774,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 )]
 pub(crate) enum StorageIoOperation {
     Open,
+    OpenAfterReplace,
     Close,
     CloseByReplace,
     Read,
@@ -787,6 +788,7 @@ impl StorageIoOperation {
     pub fn as_str(&self) -> &'static str {
         match self {
             StorageIoOperation::Open => "open",
+            StorageIoOperation::OpenAfterReplace => "open-after-replace",
             StorageIoOperation::Close => "close",
             StorageIoOperation::CloseByReplace => "close-by-replace",
             StorageIoOperation::Read => "read",
@@ -840,7 +842,6 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     )
     .expect("failed to define a metric")
 });
-
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
     global: Histogram,
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 7a6443361e..24efd14f84 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -288,6 +288,9 @@ impl VirtualFile {
         }
         let (handle, mut slot_guard) = get_open_files().find_victim_slot();
 
+        // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
+        // where our caller doesn't get to use the returned VirtualFile before its
+        // slot gets re-used by someone else.
         let file = STORAGE_IO_TIME_METRIC
             .get(StorageIoOperation::Open)
             .observe_closure_duration(|| open_options.open(path))?;
@@ -311,6 +314,9 @@ impl VirtualFile {
             timeline_id,
         };
 
+        // TODO: Under pressure, it's likely the slot will get re-used and the
+        // underlying file closed before our caller gets around to using it.
+        // => https://github.com/neondatabase/neon/issues/6065
         slot_guard.file.replace(file);
 
         Ok(vfile)
@@ -421,9 +427,12 @@ impl VirtualFile {
         // now locked in write-mode. Find a free slot to put it in.
         let (handle, mut slot_guard) = open_files.find_victim_slot();
 
-        // Open the physical file
+        // Re-open the physical file.
+        // NB: we use StorageIoOperation::OpenAfterReplace for this to distinguish this
+        // case from StorageIoOperation::Open. This helps with identifying thrashing
+        // of the virtual file descriptor cache.
         let file = STORAGE_IO_TIME_METRIC
-            .get(StorageIoOperation::Open)
+            .get(StorageIoOperation::OpenAfterReplace)
             .observe_closure_duration(|| self.open_options.open(&self.path))?;
 
         // Perform the requested operation on it

From 7aa1e583010dc5a116ebd8a27c8c7b7cbc385c70 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 7 Dec 2023 10:15:36 -0600
Subject: [PATCH 048/202] Add support for Python 3.12

---
 poetry.lock    | 121 ++++++++++++++++++++++++++++---------------------
 pyproject.toml |   4 +-
 2 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index a85325b696..8583a71f85 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -98,18 +98,18 @@ speedups = ["Brotli", "aiodns", "brotlicffi"]
 
 [[package]]
 name = "aiopg"
-version = "1.3.4"
+version = "1.4.0"
 description = "Postgres integration with asyncio."
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"},
-    {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"},
+    {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"},
+    {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"},
 ]
 
 [package.dependencies]
 async-timeout = ">=3.0,<5.0"
-psycopg2-binary = ">=2.8.4"
+psycopg2-binary = ">=2.9.5"
 
 [package.extras]
 sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
@@ -160,64 +160,71 @@ pluggy = ">=0.4.0"
 
 [[package]]
 name = "async-timeout"
-version = "4.0.2"
+version = "4.0.3"
 description = "Timeout context manager for asyncio programs"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
-    {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
+    {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
+    {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
 ]
 
 [[package]]
 name = "asyncpg"
-version = "0.27.0"
+version = "0.29.0"
 description = "An asyncio PostgreSQL driver"
 optional = false
-python-versions = ">=3.7.0"
+python-versions = ">=3.8.0"
 files = [
-    {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"},
-    {file = "asyncpg-0.27.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:20b596d8d074f6f695c13ffb8646d0b6bb1ab570ba7b0cfd349b921ff03cfc1e"},
-    {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a6206210c869ebd3f4eb9e89bea132aefb56ff3d1b7dd7e26b102b17e27bbb1"},
-    {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7a94c03386bb95456b12c66026b3a87d1b965f0f1e5733c36e7229f8f137747"},
-    {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bfc3980b4ba6f97138b04f0d32e8af21d6c9fa1f8e6e140c07d15690a0a99279"},
-    {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9654085f2b22f66952124de13a8071b54453ff972c25c59b5ce1173a4283ffd9"},
-    {file = "asyncpg-0.27.0-cp310-cp310-win32.whl", hash = "sha256:879c29a75969eb2722f94443752f4720d560d1e748474de54ae8dd230bc4956b"},
-    {file = "asyncpg-0.27.0-cp310-cp310-win_amd64.whl", hash = "sha256:ab0f21c4818d46a60ca789ebc92327d6d874d3b7ccff3963f7af0a21dc6cff52"},
-    {file = "asyncpg-0.27.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:18f77e8e71e826ba2d0c3ba6764930776719ae2b225ca07e014590545928b576"},
-    {file = "asyncpg-0.27.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2232d4625c558f2aa001942cac1d7952aa9f0dbfc212f63bc754277769e1ef2"},
-    {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a3a4ff43702d39e3c97a8786314123d314e0f0e4dabc8367db5b665c93914de"},
-    {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccddb9419ab4e1c48742457d0c0362dbdaeb9b28e6875115abfe319b29ee225d"},
-    {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:768e0e7c2898d40b16d4ef7a0b44e8150db3dd8995b4652aa1fe2902e92c7df8"},
-    {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609054a1f47292a905582a1cfcca51a6f3f30ab9d822448693e66fdddde27920"},
-    {file = "asyncpg-0.27.0-cp311-cp311-win32.whl", hash = "sha256:8113e17cfe236dc2277ec844ba9b3d5312f61bd2fdae6d3ed1c1cdd75f6cf2d8"},
-    {file = "asyncpg-0.27.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb71211414dd1eeb8d31ec529fe77cff04bf53efc783a5f6f0a32d84923f45cf"},
-    {file = "asyncpg-0.27.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4750f5cf49ed48a6e49c6e5aed390eee367694636c2dcfaf4a273ca832c5c43c"},
-    {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:eca01eb112a39d31cc4abb93a5aef2a81514c23f70956729f42fb83b11b3483f"},
-    {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5710cb0937f696ce303f5eed6d272e3f057339bb4139378ccecafa9ee923a71c"},
-    {file = "asyncpg-0.27.0-cp37-cp37m-win_amd64.whl", hash = "sha256:71cca80a056ebe19ec74b7117b09e650990c3ca535ac1c35234a96f65604192f"},
-    {file = "asyncpg-0.27.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4bb366ae34af5b5cabc3ac6a5347dfb6013af38c68af8452f27968d49085ecc0"},
-    {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16ba8ec2e85d586b4a12bcd03e8d29e3d99e832764d6a1d0b8c27dbbe4a2569d"},
-    {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d20dea7b83651d93b1eb2f353511fe7fd554752844523f17ad30115d8b9c8cd6"},
-    {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e56ac8a8237ad4adec97c0cd4728596885f908053ab725e22900b5902e7f8e69"},
-    {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bf21ebf023ec67335258e0f3d3ad7b91bb9507985ba2b2206346de488267cad0"},
-    {file = "asyncpg-0.27.0-cp38-cp38-win32.whl", hash = "sha256:69aa1b443a182b13a17ff926ed6627af2d98f62f2fe5890583270cc4073f63bf"},
-    {file = "asyncpg-0.27.0-cp38-cp38-win_amd64.whl", hash = "sha256:62932f29cf2433988fcd799770ec64b374a3691e7902ecf85da14d5e0854d1ea"},
-    {file = "asyncpg-0.27.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fddcacf695581a8d856654bc4c8cfb73d5c9df26d5f55201722d3e6a699e9629"},
-    {file = "asyncpg-0.27.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7d8585707ecc6661d07367d444bbaa846b4e095d84451340da8df55a3757e152"},
-    {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:975a320baf7020339a67315284a4d3bf7460e664e484672bd3e71dbd881bc692"},
-    {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2232ebae9796d4600a7819fc383da78ab51b32a092795f4555575fc934c1c89d"},
-    {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:88b62164738239f62f4af92567b846a8ef7cf8abf53eddd83650603de4d52163"},
-    {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eb4b2fdf88af4fb1cc569781a8f933d2a73ee82cd720e0cb4edabbaecf2a905b"},
-    {file = "asyncpg-0.27.0-cp39-cp39-win32.whl", hash = "sha256:8934577e1ed13f7d2d9cea3cc016cc6f95c19faedea2c2b56a6f94f257cea672"},
-    {file = "asyncpg-0.27.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b6499de06fe035cf2fa932ec5617ed3f37d4ebbf663b655922e105a484a6af9"},
-    {file = "asyncpg-0.27.0.tar.gz", hash = "sha256:720986d9a4705dd8a40fdf172036f5ae787225036a7eb46e704c45aa8f62c054"},
+    {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"},
+    {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"},
+    {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"},
+    {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"},
+    {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"},
+    {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"},
+    {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"},
+    {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"},
+    {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"},
+    {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"},
+    {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"},
+    {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"},
+    {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"},
+    {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"},
+    {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"},
+    {file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"},
+    {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"},
+    {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"},
+    {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"},
+    {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"},
+    {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"},
+    {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"},
+    {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"},
+    {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"},
+    {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"},
+    {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"},
+    {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"},
+    {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"},
+    {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"},
+    {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"},
+    {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"},
+    {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"},
+    {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"},
+    {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"},
+    {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"},
+    {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"},
+    {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"},
+    {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"},
+    {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"},
+    {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"},
+    {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"},
 ]
 
+[package.dependencies]
+async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""}
+
 [package.extras]
-dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4,<5.1.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"]
-docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
-test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"]
+docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
+test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"]
 
 [[package]]
 name = "attrs"
@@ -2476,6 +2483,16 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2697,4 +2714,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
+content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
diff --git a/pyproject.toml b/pyproject.toml
index 536efeab56..401acaeba4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,8 +12,8 @@ typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
 requests = "^2.31.0"
 pytest-xdist = "^3.3.1"
-asyncpg = "^0.27.0"
-aiopg = "^1.3.1"
+asyncpg = "^0.29.0"
+aiopg = "^1.4.0"
 Jinja2 = "^3.0.2"
 types-requests = "^2.31.0.0"
 types-psycopg2 = "^2.9.21.10"

From 37fdbc3aaa85e76ec146b12478fbc3a259cebf4b Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 7 Dec 2023 21:36:44 +0200
Subject: [PATCH 049/202] fix: use larger buffers for remote storage (#6069)

Currently using 8kB buffers; raise that to 32kB to hopefully cut
`spawn_blocking` usage to 1/4. Also a drive-by fix of the last
`tokio::io::copy`, changing it to `tokio::io::copy_buf`.
---
 libs/remote_storage/src/local_fs.rs                      | 2 +-
 pageserver/src/tenant/remote_timeline_client.rs          | 3 +++
 pageserver/src/tenant/remote_timeline_client/download.rs | 2 +-
 pageserver/src/tenant/remote_timeline_client/upload.rs   | 2 +-
 safekeeper/src/wal_backup.rs                             | 7 +++++--
 5 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 0016c21955..03b98e5ea2 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -260,7 +260,7 @@ impl RemoteStorage for LocalFs {
         let mut buffer_to_read = data.take(from_size_bytes);
 
         // alternatively we could just write the bytes to a file, but local_fs is a testing utility
-        let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
+        let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
             .await
             .with_context(|| {
                 format!(
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 5b649a420c..1ef9fe4a64 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -254,6 +254,9 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 
 pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
 
+/// Default buffer size when interfacing with [`tokio::fs::File`].
+const BUFFER_SIZE: usize = 32 * 1024;
+
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
     Deleted(IndexPart),
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 3356f55f34..ce942b56f8 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -90,7 +90,7 @@ pub async fn download_layer_file<'a>(
                 .map_err(DownloadError::Other)?;
 
             let mut destination_file =
-                tokio::io::BufWriter::with_capacity(8 * 1024, destination_file);
+                tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
 
             let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
 
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 0ec539a64e..e1dea3ab4b 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -105,7 +105,7 @@ pub(super) async fn upload_timeline_layer<'a>(
     let fs_size = usize::try_from(fs_size)
         .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
 
-    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, 8 * 1024);
+    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
 
     storage
         .upload(reader, fs_size, &storage_path, None)
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 2e2cb11e3f..c99bbc7d61 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -35,6 +35,9 @@ use once_cell::sync::OnceCell;
 const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
 const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
 
+/// Default buffer size when interfacing with [`tokio::fs::File`].
+const BUFFER_SIZE: usize = 32 * 1024;
+
 /// Check whether wal backup is required for timeline. If yes, mark that launcher is
 /// aware of current status and return the timeline.
 async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
@@ -498,7 +501,7 @@ async fn backup_object(
         .await
         .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
 
-    let file = tokio_util::io::ReaderStream::with_capacity(file, 8 * 1024);
+    let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE);
 
     storage.upload_storage_object(file, size, target_file).await
 }
@@ -524,7 +527,7 @@ pub async fn read_object(
 
     let reader = tokio_util::io::StreamReader::new(download.download_stream);
 
-    let reader = tokio::io::BufReader::with_capacity(8 * 1024, reader);
+    let reader = tokio::io::BufReader::with_capacity(BUFFER_SIZE, reader);
 
     Ok(Box::pin(reader))
 }

From 7914eaf1e6c91321608ca3d098b0110a9f46a36c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 8 Dec 2023 03:33:44 +0100
Subject: [PATCH 050/202] Buffer initdb.tar.zst to a temporary file before
 upload (#5944)

In https://github.com/neondatabase/neon/pull/5912#pullrequestreview-1749982732 , Christian liked the idea of using files instead of buffering the
archive to RAM for the *download* path. This change does the same for the
upload path, which is a very similar situation.
---
 pageserver/src/import_datadir.rs              | 89 ++++---------------
 pageserver/src/tenant.rs                      | 50 ++++++++---
 .../src/tenant/remote_timeline_client.rs      |  2 +-
 .../tenant/remote_timeline_client/download.rs | 12 +--
 .../tenant/remote_timeline_client/upload.rs   | 12 ++-
 5 files changed, 68 insertions(+), 97 deletions(-)

diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 452cd73f76..d95d75449d 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,9 +2,8 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
+use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
-use std::pin::Pin;
-use std::task::{self, Poll};
 
 use anyhow::{bail, ensure, Context, Result};
 use async_compression::tokio::bufread::ZstdDecoder;
@@ -13,7 +12,8 @@ use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
 use nix::NixPath;
-use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
+use tokio::fs::{File, OpenOptions};
+use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
 use tokio_tar::Archive;
 use tokio_tar::Builder;
 use tokio_tar::HeaderMode;
@@ -629,70 +629,16 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
     Ok(Bytes::from(buf))
 }
 
-/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then
-///
-/// The number of yields is bounded by above by the number of times poll_write is called,
-/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total.
-/// This is an explicit choice as the `YieldingVec` is meant to give the async executor
-/// breathing room between units of CPU intensive preparation of buffers to be written.
-/// Once a write call is issued, the whole buffer has been prepared already, so there is no
-/// gain in splitting up the memcopy further.
-struct YieldingVec {
-    yield_budget: usize,
-    // the buffer written into
-    buf: Vec<u8>,
-}
+pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
+    let file = OpenOptions::new()
+        .create(true)
+        .truncate(true)
+        .read(true)
+        .write(true)
+        .open(&tmp_path)
+        .await
+        .with_context(|| format!("tempfile creation {tmp_path}"))?;
 
-impl YieldingVec {
-    fn new() -> Self {
-        Self {
-            yield_budget: 0,
-            buf: Vec::new(),
-        }
-    }
-    // Whether we should yield for a read operation of given size
-    fn should_yield(&mut self, add_buf_len: usize) -> bool {
-        // Set this limit to a small value so that we are a
-        // good async citizen and yield repeatedly (but not
-        // too often for many small writes to cause many yields)
-        const YIELD_DIST: usize = 1024;
-
-        let target_buf_len = self.buf.len() + add_buf_len;
-        let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST;
-        if self.yield_budget < target_buf_len {
-            self.yield_budget += add_buf_len;
-        }
-        ret
-    }
-}
-
-impl AsyncWrite for YieldingVec {
-    fn poll_write(
-        mut self: Pin<&mut Self>,
-        cx: &mut task::Context<'_>,
-        buf: &[u8],
-    ) -> Poll<std::io::Result<usize>> {
-        if self.should_yield(buf.len()) {
-            cx.waker().wake_by_ref();
-            return Poll::Pending;
-        }
-        self.get_mut().buf.extend_from_slice(buf);
-        Poll::Ready(Ok(buf.len()))
-    }
-
-    fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
-        Poll::Ready(Ok(()))
-    }
-
-    fn poll_shutdown(
-        self: Pin<&mut Self>,
-        _cx: &mut task::Context<'_>,
-    ) -> Poll<std::io::Result<()>> {
-        Poll::Ready(Ok(()))
-    }
-}
-
-pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
     let mut paths = Vec::new();
     for entry in WalkDir::new(pgdata_path) {
         let entry = entry?;
@@ -707,7 +653,7 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
     // Do a sort to get a more consistent listing
     paths.sort_unstable();
     let zstd = ZstdEncoder::with_quality_and_params(
-        YieldingVec::new(),
+        file,
         Level::Default,
         &[CParameter::enable_long_distance_matching(true)],
     );
@@ -725,13 +671,14 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
     }
     let mut zstd = builder.into_inner().await?;
     zstd.shutdown().await?;
-    let compressed = zstd.into_inner();
-    let compressed_len = compressed.buf.len();
-    const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
+    let mut compressed = zstd.into_inner();
+    let compressed_len = compressed.metadata().await?.len();
+    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
     if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
         warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
     }
-    Ok(compressed.buf)
+    compressed.seek(SeekFrom::Start(0)).await?;
+    Ok((compressed, compressed_len))
 }
 
 pub async fn extract_tar_zst(
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d756e13316..48f71d7747 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,7 +12,6 @@
 //!
 
 use anyhow::{bail, Context};
-use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
@@ -69,6 +68,7 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 pub use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
+use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::InitializationOrder;
@@ -2949,10 +2949,10 @@ impl Tenant {
         };
         // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
         // temporary directory for basebackup files for the given timeline.
+
+        let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
         let pgdata_path = path_with_suffix_extension(
-            self.conf
-                .timelines_path(&self.tenant_shard_id)
-                .join(format!("basebackup-{timeline_id}")),
+            timelines_path.join(format!("basebackup-{timeline_id}")),
             TEMP_FILE_SUFFIX,
         );
 
@@ -2983,31 +2983,43 @@ impl Tenant {
                 )
                 .await
                 .context("download initdb tar")?;
-            let buf_read = Box::pin(BufReader::new(initdb_tar_zst));
+            let buf_read =
+                BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
             import_datadir::extract_tar_zst(&pgdata_path, buf_read)
                 .await
                 .context("extract initdb tar")?;
 
-            if initdb_tar_zst_path.exists() {
-                tokio::fs::remove_file(&initdb_tar_zst_path)
-                    .await
-                    .context("tempfile removal")?;
-            }
+            tokio::fs::remove_file(&initdb_tar_zst_path)
+                .await
+                .or_else(|e| {
+                    if e.kind() == std::io::ErrorKind::NotFound {
+                        // If something else already removed the file, ignore the error
+                        Ok(())
+                    } else {
+                        Err(e)
+                    }
+                })
+                .with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
         } else {
             // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
             run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
 
             // Upload the created data dir to S3
             if let Some(storage) = &self.remote_storage {
-                let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
-                let pgdata_zstd = Bytes::from(pgdata_zstd);
+                let temp_path = timelines_path.join(format!(
+                    "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
+                ));
+
+                let (pgdata_zstd, tar_zst_size) =
+                    import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
                 backoff::retry(
                     || async {
                         self::remote_timeline_client::upload_initdb_dir(
                             storage,
                             &self.tenant_shard_id.tenant_id,
                             &timeline_id,
-                            pgdata_zstd.clone(),
+                            pgdata_zstd.try_clone().await?,
+                            tar_zst_size,
                         )
                         .await
                     },
@@ -3019,6 +3031,18 @@ impl Tenant {
                     backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
                 )
                 .await?;
+
+                tokio::fs::remove_file(&temp_path)
+                    .await
+                    .or_else(|e| {
+                        if e.kind() == std::io::ErrorKind::NotFound {
+                            // If something else already removed the file, ignore the error
+                            Ok(())
+                        } else {
+                            Err(e)
+                        }
+                    })
+                    .with_context(|| format!("tempfile removal {temp_path}"))?;
             }
         }
         let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 1ef9fe4a64..03600cf5ae 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -255,7 +255,7 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
 
 /// Default buffer size when interfacing with [`tokio::fs::File`].
-const BUFFER_SIZE: usize = 32 * 1024;
+pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index ce942b56f8..ed32c4eed9 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -402,7 +402,9 @@ pub(crate) async fn download_initdb_tar_zst(
             .with_context(|| format!("timeline dir creation {timeline_path}"))
             .map_err(DownloadError::Other)?;
     }
-    let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
+    let temp_path = timeline_path.join(format!(
+        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
+    ));
 
     let file = download_retry(
         || async {
@@ -438,10 +440,10 @@ pub(crate) async fn download_initdb_tar_zst(
     )
     .await
     .map_err(|e| {
-        if temp_path.exists() {
-            // Do a best-effort attempt at deleting the temporary file upon encountering an error.
-            // We don't have async here nor do we want to pile on any extra errors.
-            if let Err(e) = std::fs::remove_file(&temp_path) {
+        // Do a best-effort attempt at deleting the temporary file upon encountering an error.
+        // We don't have async here nor do we want to pile on any extra errors.
+        if let Err(e) = std::fs::remove_file(&temp_path) {
+            if e.kind() != std::io::ErrorKind::NotFound {
                 warn!("error deleting temporary file {temp_path}: {e}");
             }
         }
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index e1dea3ab4b..d0744e7c83 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,12 +1,11 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage
 
 use anyhow::{bail, Context};
-use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
 use std::io::ErrorKind;
-use tokio::fs;
+use tokio::fs::{self, File};
 
 use super::Generation;
 use crate::{
@@ -120,17 +119,16 @@ pub(crate) async fn upload_initdb_dir(
     storage: &GenericRemoteStorage,
     tenant_id: &TenantId,
     timeline_id: &TimelineId,
-    initdb_dir: Bytes,
+    initdb_tar_zst: File,
+    size: u64,
 ) -> anyhow::Result<()> {
     tracing::trace!("uploading initdb dir");
 
-    let size = initdb_dir.len();
-
-    let bytes = futures::stream::once(futures::future::ready(Ok(initdb_dir)));
+    let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
 
     let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
     storage
-        .upload_storage_object(bytes, size, &remote_path)
+        .upload_storage_object(file, size as usize, &remote_path)
         .await
         .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }

From d9d8e9afc7150a89825d5fda52482bcb36d40c0f Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 8 Dec 2023 08:39:45 +0000
Subject: [PATCH 051/202] test_tenant_reattach: fix reattach mode names (#6070)

## Problem

Ref
https://neondb.slack.com/archives/C033QLM5P7D/p1701987609146109?thread_ts=1701976393.757279&cid=C033QLM5P7D

## Summary of changes
- Make reattach mode names unique for `test_tenant_reattach`
---
 test_runner/regress/test_tenant_detach.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 5c57fb233b..4a40982d78 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -55,7 +55,7 @@ def do_gc_target(
 class ReattachMode(str, enum.Enum):
     REATTACH_EXPLICIT = "explicit"
     REATTACH_RESET = "reset"
-    REATTACH_RESET_DROP = "reset"
+    REATTACH_RESET_DROP = "reset_drop"
 
 
 # Basic detach and re-attach test

From 65f48c7002d919464d4e141177fbe203c99d93b5 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Sat, 21 Oct 2023 22:47:20 +0300
Subject: [PATCH 052/202] Make targets to run pgindent on core and neon
 extension.

---
 .gitignore         |  3 +++
 Makefile           | 38 ++++++++++++++++++++++++++++++++++++++
 pgxn/neon/Makefile | 11 +++++++++++
 3 files changed, 52 insertions(+)

diff --git a/.gitignore b/.gitignore
index f1afdee599..c5fc121ac2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,6 @@ test_output/
 *.o
 *.so
 *.Po
+
+# pgindent typedef lists
+*.list
diff --git a/Makefile b/Makefile
index 89acbe564a..004ca3fbcf 100644
--- a/Makefile
+++ b/Makefile
@@ -260,6 +260,44 @@ distclean:
 fmt:
 	./pre-commit.py --fix-inplace
 
+postgres-%-pg-bsd-indent: postgres-%
+	+@echo "Compiling pg_bsd_indent"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
+
+# Create typedef list for the core. Note that generally it should be combined with
+# buildfarm one to cover platform specific stuff.
+# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
+postgres-%-typedefs.list: postgres-%
+	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
+
+# Indent postgres. See src/tools/pgindent/README for details.
+.PHONY: postgres-%-pgindent
+postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
+	+@echo merge with buildfarm typedef to cover all platforms
+	+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
+		REL_16_STABLE list misses PGSemaphoreData
+	# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
+	# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
+		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	+@echo note: you might want to run it on selected files/dirs instead.
+	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
+		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
+	rm -f pg*.BAK
+
+# Indent pxgn/neon.
+.PHONY: pgindent
+neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
+		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
+
+
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 7fc99523db..466e346e46 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -41,6 +41,17 @@ libwalproposer.a: $(WALPROP_OBJS)
 	rm -f $@
 	$(AR) $(AROPT) $@ $^
 
+# needs vars:
+# FIND_TYPEDEF pointing to find_typedef
+# INDENT pointing to pg_bsd_indent
+# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name:
+#   pgindent will pick it up as pg_bsd_indent path).
+.PHONY: pgindent
+pgindent:
+	+@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir)
+	$(FIND_TYPEDEF) . > neon.typedefs
+	INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h
+
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)

From 1bbd6cae24fabfefbb573a3fa05a3e809d0d7712 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 7 Dec 2023 17:24:00 +0300
Subject: [PATCH 053/202] pgindent pgxn/neon

---
 pgxn/neon/control_plane_connector.c |  19 +-
 pgxn/neon/extension_server.c        | 117 ++++-----
 pgxn/neon/file_cache.c              | 249 ++++++++++---------
 pgxn/neon/libpagestore.c            | 143 +++++------
 pgxn/neon/neon.c                    |   6 +-
 pgxn/neon/neon.h                    |   2 +-
 pgxn/neon/neon_pgversioncompat.h    |   4 +-
 pgxn/neon/pagestore_client.h        |  48 ++--
 pgxn/neon/pagestore_smgr.c          | 360 ++++++++++++++++------------
 pgxn/neon/walproposer.c             | 142 +++++------
 pgxn/neon/walproposer.h             |  27 ++-
 pgxn/neon/walproposer_compat.c      |  14 +-
 pgxn/neon/walproposer_pg.c          |  33 +--
 13 files changed, 631 insertions(+), 533 deletions(-)

diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index 2546e6de5e..2e7da671f9 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -41,7 +41,7 @@ static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;
 
 /* Curl structures for sending the HTTP requests */
-static CURL * CurlHandle;
+static CURL *CurlHandle;
 static struct curl_slist *ContentHeader = NULL;
 
 /*
@@ -54,7 +54,7 @@ typedef enum
 {
 	Op_Set,						/* An upsert: Either a creation or an alter */
 	Op_Delete,
-}			OpType;
+} OpType;
 
 typedef struct
 {
@@ -62,7 +62,7 @@ typedef struct
 	Oid			owner;
 	char		old_name[NAMEDATALEN];
 	OpType		type;
-}			DbEntry;
+} DbEntry;
 
 typedef struct
 {
@@ -70,7 +70,7 @@ typedef struct
 	char		old_name[NAMEDATALEN];
 	const char *password;
 	OpType		type;
-}			RoleEntry;
+} RoleEntry;
 
 /*
  * We keep one of these for each subtransaction in a stack. When a subtransaction
@@ -82,10 +82,10 @@ typedef struct DdlHashTable
 	struct DdlHashTable *prev_table;
 	HTAB	   *db_table;
 	HTAB	   *role_table;
-}			DdlHashTable;
+} DdlHashTable;
 
 static DdlHashTable RootTable;
-static DdlHashTable * CurrentDdlTable = &RootTable;
+static DdlHashTable *CurrentDdlTable = &RootTable;
 
 static void
 PushKeyValue(JsonbParseState **state, char *key, char *value)
@@ -199,7 +199,7 @@ typedef struct
 {
 	char		str[ERROR_SIZE];
 	size_t		size;
-}			ErrorString;
+} ErrorString;
 
 static size_t
 ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
@@ -478,7 +478,7 @@ NeonXactCallback(XactEvent event, void *arg)
 static bool
 RoleIsNeonSuperuser(const char *role_name)
 {
-    return strcmp(role_name, "neon_superuser") == 0;
+	return strcmp(role_name, "neon_superuser") == 0;
 }
 
 static void
@@ -509,6 +509,7 @@ HandleCreateDb(CreatedbStmt *stmt)
 	if (downer && downer->arg)
 	{
 		const char *owner_name = defGetString(downer);
+
 		if (RoleIsNeonSuperuser(owner_name))
 			elog(ERROR, "can't create a database with owner neon_superuser");
 		entry->owner = get_role_oid(owner_name, false);
@@ -536,6 +537,7 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
 	if (!found)
 		memset(entry->old_name, 0, sizeof(entry->old_name));
 	const char *new_owner = get_rolespec_name(stmt->newowner);
+
 	if (RoleIsNeonSuperuser(new_owner))
 		elog(ERROR, "can't alter owner to neon_superuser");
 	entry->owner = get_role_oid(new_owner, false);
@@ -633,6 +635,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	DefElem    *dpass = NULL;
 	ListCell   *option;
 	const char *role_name = stmt->role->rolename;
+
 	if (RoleIsNeonSuperuser(role_name))
 		elog(ERROR, "can't ALTER neon_superuser");
 
diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c
index 6053425de0..597eed8db5 100644
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -25,79 +25,80 @@
 
 #include <curl/curl.h>
 
-static int extension_server_port = 0;
+static int	extension_server_port = 0;
 
 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
 
-// to download all SQL (and data) files for an extension:
-// curl -X POST http://localhost:8080/extension_server/postgis
-// it covers two possible extension files layouts:
-// 1. extension_name--version--platform.sql
-// 2. extension_name/extension_name--version.sql
-//    extension_name/extra_files.csv
-//
-// to download specific library file:
-// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
+/*  to download all SQL (and data) files for an extension: */
+/*  curl -X POST http://localhost:8080/extension_server/postgis */
+/*  it covers two possible extension files layouts: */
+/*  1. extension_name--version--platform.sql */
+/*  2. extension_name/extension_name--version.sql */
+/*     extension_name/extra_files.csv */
+/*  */
+/*  to download specific library file: */
+/*  curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true */
 static bool
 neon_download_extension_file_http(const char *filename, bool is_library)
 {
-    CURL *curl;
-    CURLcode res;
-    char *compute_ctl_url;
-    char *postdata;
-    bool ret = false;
+	CURL	   *curl;
+	CURLcode	res;
+	char	   *compute_ctl_url;
+	char	   *postdata;
+	bool		ret = false;
 
-    if ((curl = curl_easy_init()) == NULL)
-    {
-        elog(ERROR, "Failed to initialize curl handle");
-    }
+	if ((curl = curl_easy_init()) == NULL)
+	{
+		elog(ERROR, "Failed to initialize curl handle");
+	}
 
-    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
-                               extension_server_port, filename, is_library ? "?is_library=true" : "");
+	compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
+							   extension_server_port, filename, is_library ? "?is_library=true" : "");
 
-    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
+	elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
 
-    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
-    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
-    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
+	curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
+	curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
+	curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );
 
-    if (curl)
-    {
-        /* Perform the request, res will get the return code */
-        res = curl_easy_perform(curl);
-        /* Check for errors */
-        if (res == CURLE_OK)
-        {
-            ret = true;
-        }
-        else
-        {
-            // Don't error here because postgres will try to find the file
-            // and will fail with some proper error message if it's not found.
-            elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
-        }
+	if (curl)
+	{
+		/* Perform the request, res will get the return code */
+		res = curl_easy_perform(curl);
+		/* Check for errors */
+		if (res == CURLE_OK)
+		{
+			ret = true;
+		}
+		else
+		{
+			/* Don't error here because postgres will try to find the file */
+			/* and will fail with some proper error message if it's not found. */
+			elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
+		}
 
-        /* always cleanup */
-        curl_easy_cleanup(curl);
-    }
+		/* always cleanup */
+		curl_easy_cleanup(curl);
+	}
 
-    return ret;
+	return ret;
 }
 
-void pg_init_extension_server()
+void
+pg_init_extension_server()
 {
-    // Port to connect to compute_ctl on localhost
-    // to request extension files.
-    DefineCustomIntVariable("neon.extension_server_port",
-                            "connection string to the compute_ctl",
-                            NULL,
-                            &extension_server_port,
-                            0, 0, INT_MAX,
-                            PGC_POSTMASTER,
-                            0, /* no flags required */
-                            NULL, NULL, NULL);
+	/* Port to connect to compute_ctl on localhost */
+	/* to request extension files. */
+	DefineCustomIntVariable("neon.extension_server_port",
+							"connection string to the compute_ctl",
+							NULL,
+							&extension_server_port,
+							0, 0, INT_MAX,
+							PGC_POSTMASTER,
+							0,	/* no flags required */
+							NULL, NULL, NULL);
 
-    // set download_extension_file_hook
-    prev_download_extension_file_hook = download_extension_file_hook;
-    download_extension_file_hook = neon_download_extension_file_http;
+	/* set download_extension_file_hook */
+	prev_download_extension_file_hook = download_extension_file_hook;
+	download_extension_file_hook = neon_download_extension_file_http;
 }
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index e70f0163c0..991b553b10 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -67,32 +67,34 @@
 typedef struct FileCacheEntry
 {
 	BufferTag	key;
-	uint32      hash;
+	uint32		hash;
 	uint32		offset;
 	uint32		access_count;
-	uint32		bitmap[BLOCKS_PER_CHUNK/32];
-	dlist_node	lru_node; /* LRU list node */
+	uint32		bitmap[BLOCKS_PER_CHUNK / 32];
+	dlist_node	lru_node;		/* LRU list node */
 } FileCacheEntry;
 
 typedef struct FileCacheControl
 {
-	uint64 generation; /* generation is needed to handle correct hash reenabling */
-	uint32 size; /* size of cache file in chunks */
-	uint32 used; /* number of used chunks */
-	uint32 limit; /* shared copy of lfc_size_limit */
-	uint64 hits;
-	uint64 misses;
-	uint64 writes;
-	dlist_head lru; /* double linked list for LRU replacement algorithm */
+	uint64		generation;		/* generation is needed to handle correct hash
+								 * reenabling */
+	uint32		size;			/* size of cache file in chunks */
+	uint32		used;			/* number of used chunks */
+	uint32		limit;			/* shared copy of lfc_size_limit */
+	uint64		hits;
+	uint64		misses;
+	uint64		writes;
+	dlist_head	lru;			/* double linked list for LRU replacement
+								 * algorithm */
 } FileCacheControl;
 
-static HTAB* lfc_hash;
-static int   lfc_desc = 0;
+static HTAB *lfc_hash;
+static int	lfc_desc = 0;
 static LWLockId lfc_lock;
-static int   lfc_max_size;
-static int   lfc_size_limit;
-static char* lfc_path;
-static  FileCacheControl* lfc_ctl;
+static int	lfc_max_size;
+static int	lfc_size_limit;
+static char *lfc_path;
+static FileCacheControl *lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
@@ -100,7 +102,7 @@ static shmem_request_hook_type prev_shmem_request_hook;
 
 #define LFC_ENABLED() (lfc_ctl->limit != 0)
 
-void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
+void		PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
 
 /*
  * Local file cache is optional and Neon can work without it.
@@ -109,9 +111,10 @@ void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
  * All cache content should be invalidated to avoid reading of stale or corrupted data
  */
 static void
-lfc_disable(char const* op)
+lfc_disable(char const *op)
 {
-	int fd;
+	int			fd;
+
 	elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
 
 	/* Invalidate hash */
@@ -120,7 +123,7 @@ lfc_disable(char const* op)
 	if (LFC_ENABLED())
 	{
 		HASH_SEQ_STATUS status;
-		FileCacheEntry* entry;
+		FileCacheEntry *entry;
 
 		hash_seq_init(&status, lfc_hash);
 		while ((entry = hash_seq_search(&status)) != NULL)
@@ -135,16 +138,24 @@ lfc_disable(char const* op)
 
 		if (lfc_desc > 0)
 		{
-			/* If the reason of error is ENOSPC, then truncation of file may help to reclaim some space */
-			int rc = ftruncate(lfc_desc, 0);
+			/*
+			 * If the reason of error is ENOSPC, then truncation of file may
+			 * help to reclaim some space
+			 */
+			int			rc = ftruncate(lfc_desc, 0);
+
 			if (rc < 0)
 				elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
 		}
 	}
-	/* We need to use unlink to to avoid races in LFC write, because it is not protectedby */
+
+	/*
+	 * We need to use unlink to avoid races in LFC write, because it is not
+	 * protected by lfc_lock
+	 */
 	unlink(lfc_path);
 
-	fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
+	fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 	if (fd < 0)
 		elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path);
 	else
@@ -170,13 +181,15 @@ lfc_maybe_disabled(void)
 static bool
 lfc_ensure_opened(void)
 {
-	bool enabled = !lfc_maybe_disabled();
+	bool		enabled = !lfc_maybe_disabled();
+
 	/* Open cache file if not done yet */
 	if (lfc_desc <= 0 && enabled)
 	{
 		lfc_desc = BasicOpenFile(lfc_path, O_RDWR);
 
-		if (lfc_desc < 0) {
+		if (lfc_desc < 0)
+		{
 			lfc_disable("open");
 			return false;
 		}
@@ -187,7 +200,7 @@ lfc_ensure_opened(void)
 static void
 lfc_shmem_startup(void)
 {
-	bool found;
+	bool		found;
 	static HASHCTL info;
 
 	if (prev_shmem_startup_hook)
@@ -197,17 +210,22 @@ lfc_shmem_startup(void)
 
 	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
 
-	lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
+	lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
 	if (!found)
 	{
-		int fd;
-		uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
-		lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
+		int			fd;
+		uint32		lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+
+		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);
 		lfc_hash = ShmemInitHash("lfc_hash",
-								 /* lfc_size+1 because we add new element to hash table before eviction of victim */
-								 lfc_size+1, lfc_size+1,
+
+		/*
+		 * lfc_size+1 because we add new element to hash table before eviction
+		 * of victim
+		 */
+								 lfc_size + 1, lfc_size + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->generation = 0;
@@ -219,7 +237,7 @@ lfc_shmem_startup(void)
 		dlist_init(&lfc_ctl->lru);
 
 		/* Recreate file cache on restart */
-		fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
+		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
 		if (fd < 0)
 		{
 			elog(WARNING, "Failed to create local file cache %s: %m", lfc_path);
@@ -242,7 +260,7 @@ lfc_shmem_request(void)
 		prev_shmem_request_hook();
 #endif
 
-	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
+	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry)));
 	RequestNamedLWLockTranche("lfc_lock", 1);
 }
 
@@ -250,9 +268,11 @@ static bool
 is_normal_backend(void)
 {
 	/*
-	 * Stats collector detach shared memory, so we should not try to access shared memory here.
-	 * Parallel workers first assign default value (0), so not perform truncation in parallel workers.
-	 * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
+	 * The stats collector detaches shared memory, so we should not try to
+	 * access shared memory here. Parallel workers first assign the default
+	 * value (0), so do not perform truncation in parallel workers. The
+	 * postmaster can handle SIGHUP and has access to shared memory
+	 * (UsedShmemSegAddr != NULL), but has no PGPROC.
 	 */
 	return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
 }
@@ -271,7 +291,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
 static void
 lfc_change_limit_hook(int newval, void *extra)
 {
-	uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
+	uint32		new_size = SIZE_MB_TO_CHUNKS(newval);
 
 	if (!is_normal_backend())
 		return;
@@ -283,11 +303,15 @@ lfc_change_limit_hook(int newval, void *extra)
 
 	while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
 	{
-		/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
-		FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+		/*
+		 * Shrink cache by throwing away least recently accessed chunks and
+		 * returning their space to file system
+		 */
+		FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+
 		Assert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
-		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
+		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
 			elog(LOG, "Failed to punch hole in file: %m");
 #endif
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -314,7 +338,7 @@ lfc_init(void)
 							"Maximal size of Neon local file cache",
 							NULL,
 							&lfc_max_size,
-							0, /* disabled by default */
+							0,	/* disabled by default */
 							0,
 							INT_MAX,
 							PGC_POSTMASTER,
@@ -327,7 +351,7 @@ lfc_init(void)
 							"Current limit for size of Neon local file cache",
 							NULL,
 							&lfc_size_limit,
-							0, /* disabled by default */
+							0,	/* disabled by default */
 							0,
 							INT_MAX,
 							PGC_SIGHUP,
@@ -367,18 +391,18 @@ lfc_init(void)
 bool
 lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
-	BufferTag tag;
-	FileCacheEntry* entry;
-	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
-	bool found = false;
-	uint32 hash;
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+	bool		found = false;
+	uint32		hash;
 
-	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -397,13 +421,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 void
 lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
-	BufferTag tag;
-	FileCacheEntry* entry;
-	bool found;
-	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
-	uint32 hash;
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	bool		found;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+	uint32		hash;
 
-	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -438,9 +462,10 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	 */
 	if (entry->bitmap[chunk_offs >> 5] == 0)
 	{
-		bool has_remaining_pages;
+		bool		has_remaining_pages;
 
-		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
+		{
 			if (entry->bitmap[i] != 0)
 			{
 				has_remaining_pages = true;
@@ -449,8 +474,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 		}
 
 		/*
-		 * Put the entry at the position that is first to be reclaimed when
-		 * we have no cached pages remaining in the chunk
+		 * Put the entry at the position that is first to be reclaimed when we
+		 * have no cached pages remaining in the chunk
 		 */
 		if (!has_remaining_pages)
 		{
@@ -476,16 +501,16 @@ bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 char *buffer)
 {
-	BufferTag tag;
-	FileCacheEntry* entry;
-	ssize_t rc;
-	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
-	bool result = true;
-	uint32 hash;
-	uint64 generation;
-	uint32 entry_offset;
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	ssize_t		rc;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+	bool		result = true;
+	uint32		hash;
+	uint64		generation;
+	uint32		entry_offset;
 
-	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;
 
 	if (!lfc_ensure_opened())
@@ -493,7 +518,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -520,7 +545,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 	LWLockRelease(lfc_lock);
 
-	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
 	if (rc != BLCKSZ)
 	{
 		lfc_disable("read");
@@ -551,30 +576,30 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
  * If cache is full then evict some other page.
  */
 void
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+			lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-		  char *buffer)
+					  char *buffer)
 #else
-		  const void *buffer)
+					  const void *buffer)
 #endif
 {
-	BufferTag tag;
-	FileCacheEntry* entry;
-	ssize_t rc;
-	bool found;
-	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
-	uint32 hash;
-	uint64 generation;
-	uint32 entry_offset;
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	ssize_t		rc;
+	bool		found;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+	uint32		hash;
+	uint64		generation;
+	uint32		entry_offset;
 
-	if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;
 
 	if (!lfc_ensure_opened())
 		return;
 
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	hash = get_hash_value(lfc_hash, &tag);
 
@@ -590,24 +615,30 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 	if (found)
 	{
-		/* Unlink entry from LRU list to pin it for the duration of IO operation */
+		/*
+		 * Unlink entry from LRU list to pin it for the duration of IO
+		 * operation
+		 */
 		if (entry->access_count++ == 0)
 			dlist_delete(&entry->lru_node);
 	}
 	else
 	{
 		/*
-		 * We have two choices if all cache pages are pinned (i.e. used in IO operations):
-		 * 1. Wait until some of this operation is completed and pages is unpinned
-		 * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit.
-		 * As far as probability of such event (that all pages are pinned) is considered to be very very small:
-		 * there are should be very large number of concurrent IO operations and them are limited by max_connections,
+		 * We have two choices if all cache pages are pinned (i.e. used in IO
+		 * operations): (1) wait until some of these operations complete and
+		 * their pages are unpinned, or (2) allocate one more chunk, so that the
+		 * specified cache size is more a recommendation than a hard limit.
+		 * The probability of such an event (all pages pinned) is considered
+		 * very small: it would take a very large number of concurrent IO
+		 * operations, and those are limited by max_connections. So we prefer
+		 * not to complicate the code and use the second approach.
 		 */
 		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+			FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+
 			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -616,7 +647,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		else
 		{
 			lfc_ctl->used += 1;
-			entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
+			entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
+												 * of file */
 		}
 		entry->access_count = 1;
 		entry->hash = hash;
@@ -628,7 +660,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	lfc_ctl->writes += 1;
 	LWLockRelease(lfc_lock);
 
-	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
 	if (rc != BLCKSZ)
 	{
 		lfc_disable("write");
@@ -665,13 +697,13 @@ Datum
 neon_get_lfc_stats(PG_FUNCTION_ARGS)
 {
 	FuncCallContext *funcctx;
-	NeonGetStatsCtx* fctx;
+	NeonGetStatsCtx *fctx;
 	MemoryContext oldcontext;
 	TupleDesc	tupledesc;
 	Datum		result;
 	HeapTuple	tuple;
-	char const* key;
-	uint64      value;
+	char const *key;
+	uint64		value;
 	Datum		values[NUM_NEON_GET_STATS_COLS];
 	bool		nulls[NUM_NEON_GET_STATS_COLS];
 
@@ -683,7 +715,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
 		/* Create a user function context for cross-call persistence */
-		fctx = (NeonGetStatsCtx*) palloc(sizeof(NeonGetStatsCtx));
+		fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx));
 
 		/* Construct a tuple descriptor for the result rows. */
 		tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);
@@ -704,7 +736,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 	funcctx = SRF_PERCALL_SETUP();
 
 	/* Get the saved state */
-	fctx = (NeonGetStatsCtx*) funcctx->user_fctx;
+	fctx = (NeonGetStatsCtx *) funcctx->user_fctx;
 
 	switch (funcctx->call_cntr)
 	{
@@ -792,9 +824,9 @@ local_cache_pages(PG_FUNCTION_ARGS)
 
 	if (SRF_IS_FIRSTCALL())
 	{
-        HASH_SEQ_STATUS status;
-		FileCacheEntry* entry;
-		uint32 n_pages = 0;
+		HASH_SEQ_STATUS status;
+		FileCacheEntry *entry;
+		uint32		n_pages = 0;
 
 		funcctx = SRF_FIRSTCALL_INIT();
 
@@ -851,7 +883,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < BLOCKS_PER_CHUNK/32; i++)
+					for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
 						n_pages += pg_popcount32(entry->bitmap[i]);
 				}
 			}
@@ -870,10 +902,11 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		if (n_pages != 0)
 		{
 			/*
-			 * Scan through all the cache entries, saving the relevant fields in the
-			 * fctx->record structure.
+			 * Scan through all the cache entries, saving the relevant fields
+			 * in the fctx->record structure.
 			 */
-			uint32 n = 0;
+			uint32		n = 0;
+
 			hash_seq_init(&status, lfc_hash);
 			while ((entry = hash_seq_search(&status)) != NULL)
 			{
@@ -881,7 +914,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				{
 					if (entry->bitmap[i >> 5] & (1 << (i & 31)))
 					{
-						fctx->record[n].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
+						fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
 						fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
 						fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
 						fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 8eb9ebb915..16406ce8a3 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -69,9 +69,9 @@ int			max_reconnect_attempts = 60;
 
 typedef struct
 {
-    LWLockId lock;
-    pg_atomic_uint64 update_counter;
-    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+	LWLockId	lock;
+	pg_atomic_uint64 update_counter;
+	char		pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 } PagestoreShmemState;
 
 #if PG_VERSION_NUM >= 150000
@@ -83,7 +83,7 @@ static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
 static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 
-bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+bool		(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
 
 static bool pageserver_flush(void);
 static void pageserver_disconnect(void);
@@ -91,43 +91,43 @@ static void pageserver_disconnect(void);
 static bool
 PagestoreShmemIsValid()
 {
-    return pagestore_shared && UsedShmemSegAddr;
+	return pagestore_shared && UsedShmemSegAddr;
 }
 
 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
-    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+	return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
 }
 
 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
-    if(!PagestoreShmemIsValid())
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
-    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-    LWLockRelease(pagestore_shared->lock);
+	if (!PagestoreShmemIsValid())
+		return;
+	LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
+	strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
+	pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
+	LWLockRelease(pagestore_shared->lock);
 }
 
 static bool
 CheckConnstringUpdated()
 {
-    if(!PagestoreShmemIsValid())
-        return false;
-    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
+	if (!PagestoreShmemIsValid())
+		return false;
+	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
 }
 
 static void
 ReloadConnstring()
 {
-    if(!PagestoreShmemIsValid())
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-    LWLockRelease(pagestore_shared->lock);
+	if (!PagestoreShmemIsValid())
+		return;
+	LWLockAcquire(pagestore_shared->lock, LW_SHARED);
+	strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
+	pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
+	LWLockRelease(pagestore_shared->lock);
 }
 
 static bool
@@ -141,21 +141,20 @@ pageserver_connect(int elevel)
 
 	Assert(!connected);
 
-        if(CheckConnstringUpdated())
-        {
-            ReloadConnstring();
-        }
+	if (CheckConnstringUpdated())
+	{
+		ReloadConnstring();
+	}
 
 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
 	 * variable was set, use that as the password.
 	 *
-	 * The connection options are parsed in the order they're given, so
-	 * when we set the password before the connection string, the
-	 * connection string can override the password from the env variable.
-	 * Seems useful, although we don't currently use that capability
-	 * anywhere.
+	 * The connection options are parsed in the order they're given, so when
+	 * we set the password before the connection string, the connection string
+	 * can override the password from the env variable. Seems useful, although
+	 * we don't currently use that capability anywhere.
 	 */
 	n = 0;
 	if (neon_auth_token)
@@ -198,9 +197,9 @@ pageserver_connect(int elevel)
 
 	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
 	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
-			  MyLatch, NULL);
+					  MyLatch, NULL);
 	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-			  NULL, NULL);
+					  NULL, NULL);
 	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
 
 	while (PQisBusy(pageserver_conn))
@@ -265,6 +264,7 @@ retry:
 			if (!PQconsumeInput(pageserver_conn))
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
 				neon_log(LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
@@ -305,15 +305,15 @@ pageserver_disconnect(void)
 }
 
 static bool
-pageserver_send(NeonRequest * request)
+pageserver_send(NeonRequest *request)
 {
 	StringInfoData req_buff;
 
-        if(CheckConnstringUpdated())
-        {
-            pageserver_disconnect();
-            ReloadConnstring();
-        }
+	if (CheckConnstringUpdated())
+	{
+		pageserver_disconnect();
+		ReloadConnstring();
+	}
 
 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
@@ -326,10 +326,12 @@ pageserver_send(NeonRequest * request)
 
 	/*
 	 * If pageserver is stopped, the connections from compute node are broken.
-	 * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
-	 * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
-	 * See https://github.com/neondatabase/neon/issues/1138
-	 * So try to reestablish connection in case of failure.
+	 * The compute node doesn't notice that immediately, but it will cause the
+	 * next request to fail, usually on the next query. That causes
+	 * user-visible errors if pageserver is restarted, or the tenant is moved
+	 * from one pageserver to another (see
+	 * https://github.com/neondatabase/neon/issues/1138). So try to reestablish
+	 * the connection in case of failure.
 	 */
 	if (!connected)
 	{
@@ -353,6 +355,7 @@ pageserver_send(NeonRequest * request)
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
 		pageserver_disconnect();
 		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
@@ -410,7 +413,8 @@ pageserver_receive(void)
 		}
 		else if (rc == -2)
 		{
-			char* msg = pchomp(PQerrorMessage(pageserver_conn));
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
 			pageserver_disconnect();
 			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
@@ -444,6 +448,7 @@ pageserver_flush(void)
 		if (PQflush(pageserver_conn))
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
 			pageserver_disconnect();
 			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
@@ -471,46 +476,47 @@ check_neon_id(char **newval, void **extra, GucSource source)
 static Size
 PagestoreShmemSize(void)
 {
-    return sizeof(PagestoreShmemState);
+	return sizeof(PagestoreShmemState);
 }
 
 static bool
 PagestoreShmemInit(void)
 {
-    bool found;
-    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
-    pagestore_shared = ShmemInitStruct("libpagestore shared state",
-                                       PagestoreShmemSize(),
-                                       &found);
-    if(!found)
-    {
-        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
-        AssignPageserverConnstring(page_server_connstring, NULL);
-    }
-    LWLockRelease(AddinShmemInitLock);
-    return found;
+	bool		found;
+
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+	pagestore_shared = ShmemInitStruct("libpagestore shared state",
+									   PagestoreShmemSize(),
+									   &found);
+	if (!found)
+	{
+		pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
+		pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
+		AssignPageserverConnstring(page_server_connstring, NULL);
+	}
+	LWLockRelease(AddinShmemInitLock);
+	return found;
 }
 
 static void
 pagestore_shmem_startup_hook(void)
 {
-    if(prev_shmem_startup_hook)
-        prev_shmem_startup_hook();
+	if (prev_shmem_startup_hook)
+		prev_shmem_startup_hook();
 
-    PagestoreShmemInit();
+	PagestoreShmemInit();
 }
 
 static void
 pagestore_shmem_request(void)
 {
 #if PG_VERSION_NUM >= 150000
-    if(prev_shmem_request_hook)
-        prev_shmem_request_hook();
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
 #endif
 
-    RequestAddinShmemSpace(PagestoreShmemSize());
-    RequestNamedLWLockTranche("neon_libpagestore", 1);
+	RequestAddinShmemSpace(PagestoreShmemSize());
+	RequestNamedLWLockTranche("neon_libpagestore", 1);
 }
 
 static void
@@ -520,7 +526,7 @@ pagestore_prepare_shmem(void)
 	prev_shmem_request_hook = shmem_request_hook;
 	shmem_request_hook = pagestore_shmem_request;
 #else
-        pagestore_shmem_request();
+	pagestore_shmem_request();
 #endif
 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = pagestore_shmem_startup_hook;
@@ -532,7 +538,7 @@ pagestore_prepare_shmem(void)
 void
 pg_init_libpagestore(void)
 {
-        pagestore_prepare_shmem();
+	pagestore_prepare_shmem();
 
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
@@ -607,7 +613,10 @@ pg_init_libpagestore(void)
 	neon_log(PageStoreTrace, "libpagestore already loaded");
 	page_server = &api;
 
-	/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
+	/*
+	 * Retrieve the auth token to use when connecting to pageserver and
+	 * safekeepers
+	 */
 	neon_auth_token = getenv("NEON_AUTH_TOKEN");
 	if (neon_auth_token)
 		neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 4850b0d6a1..6a8f8cca70 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -48,9 +48,9 @@ _PG_init(void)
 
 	pg_init_extension_server();
 
-	// Important: This must happen after other parts of the extension
-	// are loaded, otherwise any settings to GUCs that were set before
-	// the extension was loaded will be removed.
+	/* Important: This must happen after other parts of the extension */
+	/* are loaded, otherwise any settings to GUCs that were set before */
+	/* the extension was loaded will be removed. */
 	EmitWarningsOnPlaceholders("neon");
 }
 
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index 3300c67456..897a8373a1 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -32,7 +32,7 @@ extern void pg_init_extension_server(void);
  * block_id; false otherwise.
  */
 extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
-extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
+extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
 
 extern uint64 BackpressureThrottlingTime(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index 8db0d5341e..d38ef48910 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -59,7 +59,7 @@
 
 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
 
-#else /* major version >= 16 */
+#else							/* major version >= 16 */
 
 #define USE_RELFILELOCATOR
 
@@ -109,4 +109,4 @@
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif
 
-#endif //NEON_PGVERSIONCOMPAT_H
+#endif	/* NEON_PGVERSIONCOMPAT_H */
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index d61f74b5c8..ecfadb01d6 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -40,13 +40,13 @@ typedef enum
 	T_NeonGetPageResponse,
 	T_NeonErrorResponse,
 	T_NeonDbSizeResponse,
-}			NeonMessageTag;
+} NeonMessageTag;
 
 /* base struct for c-style inheritance */
 typedef struct
 {
 	NeonMessageTag tag;
-}			NeonMessage;
+} NeonMessage;
 
 #define messageTag(m) (((const NeonMessage *)(m))->tag)
 
@@ -67,27 +67,27 @@ typedef struct
 	NeonMessageTag tag;
 	bool		latest;			/* if true, request latest page version */
 	XLogRecPtr	lsn;			/* request page version @ this LSN */
-}			NeonRequest;
+} NeonRequest;
 
 typedef struct
 {
 	NeonRequest req;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-}			NeonExistsRequest;
+} NeonExistsRequest;
 
 typedef struct
 {
 	NeonRequest req;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-}			NeonNblocksRequest;
+} NeonNblocksRequest;
 
 typedef struct
 {
 	NeonRequest req;
 	Oid			dbNode;
-}			NeonDbSizeRequest;
+} NeonDbSizeRequest;
 
 typedef struct
 {
@@ -95,31 +95,31 @@ typedef struct
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 	BlockNumber blkno;
-}			NeonGetPageRequest;
+} NeonGetPageRequest;
 
 /* supertype of all the Neon*Response structs below */
 typedef struct
 {
 	NeonMessageTag tag;
-}			NeonResponse;
+} NeonResponse;
 
 typedef struct
 {
 	NeonMessageTag tag;
 	bool		exists;
-}			NeonExistsResponse;
+} NeonExistsResponse;
 
 typedef struct
 {
 	NeonMessageTag tag;
 	uint32		n_blocks;
-}			NeonNblocksResponse;
+} NeonNblocksResponse;
 
 typedef struct
 {
 	NeonMessageTag tag;
 	char		page[FLEXIBLE_ARRAY_MEMBER];
-}			NeonGetPageResponse;
+} NeonGetPageResponse;
 
 #define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))
 
@@ -127,18 +127,18 @@ typedef struct
 {
 	NeonMessageTag tag;
 	int64		db_size;
-}			NeonDbSizeResponse;
+} NeonDbSizeResponse;
 
 typedef struct
 {
 	NeonMessageTag tag;
 	char		message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
 												 * message */
-}			NeonErrorResponse;
+} NeonErrorResponse;
 
-extern StringInfoData nm_pack_request(NeonRequest * msg);
-extern NeonResponse * nm_unpack_response(StringInfo s);
-extern char *nm_to_string(NeonMessage * msg);
+extern StringInfoData nm_pack_request(NeonRequest *msg);
+extern NeonResponse *nm_unpack_response(StringInfo s);
+extern char *nm_to_string(NeonMessage *msg);
 
 /*
  * API
@@ -146,20 +146,20 @@ extern char *nm_to_string(NeonMessage * msg);
 
 typedef struct
 {
-	bool		(*send) (NeonRequest * request);
+	bool		(*send) (NeonRequest *request);
 	NeonResponse *(*receive) (void);
 	bool		(*flush) (void);
-}			page_server_api;
+} page_server_api;
 
 extern void prefetch_on_ps_disconnect(void);
 
-extern page_server_api * page_server;
+extern page_server_api *page_server;
 
 extern char *page_server_connstring;
-extern int flush_every_n_requests;
-extern int readahead_buffer_size;
+extern int	flush_every_n_requests;
+extern int	readahead_buffer_size;
 extern bool seqscan_prefetch_enabled;
-extern int seqscan_prefetch_distance;
+extern int	seqscan_prefetch_distance;
 extern char *neon_timeline;
 extern char *neon_tenant;
 extern bool wal_redo;
@@ -194,14 +194,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-							 XLogRecPtr request_lsn, bool request_latest, char *buffer);
+										 XLogRecPtr request_lsn, bool request_latest, char *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-							 XLogRecPtr request_lsn, bool request_latest, void *buffer);
+										 XLogRecPtr request_lsn, bool request_latest, void *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 74ffbdb371..c9c41cd1cc 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -101,21 +101,21 @@ typedef enum
 	UNLOGGED_BUILD_PHASE_1,
 	UNLOGGED_BUILD_PHASE_2,
 	UNLOGGED_BUILD_NOT_PERMANENT
-}			UnloggedBuildPhase;
+} UnloggedBuildPhase;
 
 static SMgrRelation unlogged_build_rel = NULL;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 
 /*
  * Prefetch implementation:
- * 
+ *
  * Prefetch is performed locally by each backend.
  *
  * There can be up to readahead_buffer_size active IO requests registered at
  * any time. Requests using smgr_prefetch are sent to the pageserver, but we
  * don't wait on the response. Requests using smgr_read are either read from
  * the buffer, or (if that's not possible) we wait on the response to arrive -
- * this also will allow us to receive other prefetched pages. 
+ * this also will allow us to receive other prefetched pages.
  * Each request is immediately written to the output buffer of the pageserver
  * connection, but may not be flushed if smgr_prefetch is used: pageserver
  * flushes sent requests on manual flush, or every neon.flush_output_after
@@ -139,7 +139,7 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 
 /*
  * State machine:
- *        
+ *
  * not in hash : in hash
  *             :
  * UNUSED ------> REQUESTED --> RECEIVED
@@ -150,30 +150,34 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
  *   +----------------+------------+
  *             :
  */
-typedef enum PrefetchStatus {
-	PRFS_UNUSED = 0,	/* unused slot */
-	PRFS_REQUESTED,		/* request was written to the sendbuffer to PS, but not
-						 * necessarily flushed.
-						 * all fields except response valid */
-	PRFS_RECEIVED,		/* all fields valid */
-	PRFS_TAG_REMAINS,	/* only buftag and my_ring_index are still valid */
+typedef enum PrefetchStatus
+{
+	PRFS_UNUSED = 0,			/* unused slot */
+	PRFS_REQUESTED,				/* request was written to the sendbuffer to
+								 * PS, but not necessarily flushed. all fields
+								 * except response valid */
+	PRFS_RECEIVED,				/* all fields valid */
+	PRFS_TAG_REMAINS,			/* only buftag and my_ring_index are still
+								 * valid */
 } PrefetchStatus;
 
-typedef struct PrefetchRequest {
-	BufferTag	buftag; /* must be first entry in the struct */
+typedef struct PrefetchRequest
+{
+	BufferTag	buftag;			/* must be first entry in the struct */
 	XLogRecPtr	effective_request_lsn;
 	XLogRecPtr	actual_request_lsn;
-	NeonResponse *response; /* may be null */
+	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
 	uint64		my_ring_index;
 } PrefetchRequest;
 
 /* prefetch buffer lookup hash table */
 
-typedef struct PrfHashEntry {
+typedef struct PrfHashEntry
+{
 	PrefetchRequest *slot;
-	uint32 status;
-	uint32 hash;
+	uint32		status;
+	uint32		hash;
 } PrfHashEntry;
 
 #define SH_PREFIX			prfh
@@ -197,36 +201,42 @@ typedef struct PrfHashEntry {
 /*
  * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
  * It maintains a (ring) buffer of in-flight requests and responses.
- * 
+ *
  * We maintain several indexes into the ring buffer:
  * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
- * 
+ *
  * ring_unused points to the first unused slot of the buffer
  * ring_receive is the next request that is to be received
  * ring_last is the oldest received entry in the buffer
- * 
+ *
  * Apart from being an entry in the ring buffer of prefetch requests, each
  * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
  */
-typedef struct PrefetchState {
-	MemoryContext bufctx; /* context for prf_buffer[].response allocations */
-	MemoryContext errctx; /* context for prf_buffer[].response allocations */
-	MemoryContext hashctx; /* context for prf_buffer */
+typedef struct PrefetchState
+{
+	MemoryContext bufctx;		/* context for prf_buffer[].response
+								 * allocations */
+	MemoryContext errctx;		/* context for prf_buffer[].response
+								 * allocations */
+	MemoryContext hashctx;		/* context for prf_buffer */
 
 	/* buffer indexes */
-	uint64	ring_unused;		/* first unused slot */
-	uint64	ring_flush;			/* next request to flush */
-	uint64	ring_receive;		/* next slot that is to receive a response */
-	uint64	ring_last;			/* min slot with a response value */
+	uint64		ring_unused;	/* first unused slot */
+	uint64		ring_flush;		/* next request to flush */
+	uint64		ring_receive;	/* next slot that is to receive a response */
+	uint64		ring_last;		/* min slot with a response value */
 
 	/* metrics / statistics  */
-	int		n_responses_buffered;	/* count of PS responses not yet in buffers */
-	int		n_requests_inflight;	/* count of PS requests considered in flight */
-	int		n_unused;				/* count of buffers < unused, > last, that are also unused */
+	int			n_responses_buffered;	/* count of PS responses not yet in
+										 * buffers */
+	int			n_requests_inflight;	/* count of PS requests considered in
+										 * flight */
+	int			n_unused;		/* count of buffers < unused, > last, that are
+								 * also unused */
 
 	/* the buffers */
-	prfh_hash *prf_hash;
-	PrefetchRequest prf_buffer[]; /* prefetch buffers */
+	prfh_hash  *prf_hash;
+	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;
 
 PrefetchState *MyPState;
@@ -264,10 +274,10 @@ static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
 static bool
 compact_prefetch_buffers(void)
 {
-	uint64	empty_ring_index = MyPState->ring_last;
-	uint64	search_ring_index = MyPState->ring_receive;
-	int n_moved = 0;
-	
+	uint64		empty_ring_index = MyPState->ring_last;
+	uint64		search_ring_index = MyPState->ring_receive;
+	int			n_moved = 0;
+
 	if (MyPState->ring_receive == MyPState->ring_last)
 		return false;
 
@@ -282,15 +292,14 @@ compact_prefetch_buffers(void)
 	}
 
 	/*
-	 * Here we have established:
-	 *   slots < search_ring_index have an unknown state (not scanned)
-	 *   slots >= search_ring_index and <= empty_ring_index are unused
-	 *   slots > empty_ring_index are in use, or outside our buffer's range.
-	 * ... unless search_ring_index <= ring_last
-	 * 
+	 * Here we have established: slots < search_ring_index have an unknown
+	 * state (not scanned); slots >= search_ring_index and <= empty_ring_index
+	 * are unused; slots > empty_ring_index are in use, or outside our
+	 * buffer's range. ... unless search_ring_index <= ring_last.
+	 *
 	 * Therefore, there is a gap of at least one unused items between
-	 * search_ring_index and empty_ring_index (both inclusive), which grows as we hit
-	 * more unused items while moving backwards through the array.
+	 * search_ring_index and empty_ring_index (both inclusive), which grows as
+	 * we hit more unused items while moving backwards through the array.
 	 */
 
 	while (search_ring_index > MyPState->ring_last)
@@ -330,7 +339,10 @@ compact_prefetch_buffers(void)
 
 		/* empty the moved slot */
 		source_slot->status = PRFS_UNUSED;
-		source_slot->buftag = (BufferTag) {0};
+		source_slot->buftag = (BufferTag)
+		{
+			0
+		};
 		source_slot->response = NULL;
 		source_slot->my_ring_index = 0;
 		source_slot->effective_request_lsn = 0;
@@ -340,8 +352,8 @@ compact_prefetch_buffers(void)
 	}
 
 	/*
-	 * Only when we've moved slots we can expect trailing unused slots,
-	 * so only then we clean up trailing unused slots.
+	 * Only when we've moved slots we can expect trailing unused slots, so
+	 * only then we clean up trailing unused slots.
 	 */
 	if (n_moved > 0)
 	{
@@ -358,10 +370,10 @@ readahead_buffer_resize(int newsize, void *extra)
 	uint64		end,
 				nfree = newsize;
 	PrefetchState *newPState;
-	Size 		newprfs_size = offsetof(PrefetchState, prf_buffer) + (
-		sizeof(PrefetchRequest) * newsize
-	);
-	
+	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) + (
+																	  sizeof(PrefetchRequest) * newsize
+		);
+
 	/* don't try to re-initialize if we haven't initialized yet */
 	if (MyPState == NULL)
 		return;
@@ -388,12 +400,12 @@ readahead_buffer_resize(int newsize, void *extra)
 	newPState->ring_receive = newsize;
 	newPState->ring_flush = newsize;
 
-	/* 
+	/*
 	 * Copy over the prefetches.
-	 * 
+	 *
 	 * We populate the prefetch array from the end; to retain the most recent
-	 * prefetches, but this has the benefit of only needing to do one iteration
-	 * on the dataset, and trivial compaction.
+	 * prefetches, but this has the benefit of only needing to do one
+	 * iteration on the dataset, and trivial compaction.
 	 */
 	for (end = MyPState->ring_unused - 1;
 		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
@@ -401,7 +413,7 @@ readahead_buffer_resize(int newsize, void *extra)
 	{
 		PrefetchRequest *slot = GetPrfSlot(end);
 		PrefetchRequest *newslot;
-		bool	found;
+		bool		found;
 
 		if (slot->status == PRFS_UNUSED)
 			continue;
@@ -464,10 +476,11 @@ consume_prefetch_responses(void)
 static void
 prefetch_cleanup_trailing_unused(void)
 {
-	uint64	ring_index;
+	uint64		ring_index;
 	PrefetchRequest *slot;
 
-	while (MyPState->ring_last < MyPState->ring_receive) {
+	while (MyPState->ring_last < MyPState->ring_receive)
+	{
 		ring_index = MyPState->ring_last;
 		slot = GetPrfSlot(ring_index);
 
@@ -481,7 +494,7 @@ prefetch_cleanup_trailing_unused(void)
 /*
  * Wait for slot of ring_index to have received its response.
  * The caller is responsible for making sure the request buffer is flushed.
- * 
+ *
  * NOTE: this function may indirectly update MyPState->pfs_hash; which
  * invalidates any active pointers into the hash table.
  */
@@ -513,7 +526,7 @@ prefetch_wait_for(uint64 ring_index)
 
 /*
  * Read the response of a prefetch request into its slot.
- * 
+ *
  * The caller is responsible for making sure that the request for this buffer
  * was flushed to the PageServer.
  *
@@ -553,7 +566,7 @@ prefetch_read(PrefetchRequest *slot)
 
 /*
  * Disconnect hook - drop prefetches when the connection drops
- * 
+ *
  * If we don't remove the failed prefetches, we'd be serving incorrect
  * data to the smgr.
  */
@@ -564,7 +577,7 @@ prefetch_on_ps_disconnect(void)
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
-		uint64 ring_index = MyPState->ring_receive;
+		uint64		ring_index = MyPState->ring_receive;
 
 		slot = GetPrfSlot(ring_index);
 
@@ -594,7 +607,7 @@ prefetch_set_unused(uint64 ring_index)
 	PrefetchRequest *slot = GetPrfSlot(ring_index);
 
 	if (ring_index < MyPState->ring_last)
-		return; /* Should already be unused */
+		return;					/* Should already be unused */
 
 	Assert(MyPState->ring_unused > ring_index);
 
@@ -625,7 +638,11 @@ prefetch_set_unused(uint64 ring_index)
 	/* run cleanup if we're holding back ring_last */
 	if (MyPState->ring_last == ring_index)
 		prefetch_cleanup_trailing_unused();
-	/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
+
+	/*
+	 * ... and try to store the buffered responses more compactly if > 12.5%
+	 * of the buffer is gaps
+	 */
 	else if (ReceiveBufferNeedsCompaction())
 		compact_prefetch_buffers();
 }
@@ -633,7 +650,7 @@ prefetch_set_unused(uint64 ring_index)
 static void
 prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
 {
-	bool found;
+	bool		found;
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		.req.latest = false,
@@ -651,21 +668,22 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	}
 	else
 	{
-		XLogRecPtr lsn = neon_get_request_lsn(
-			&request.req.latest,
-			BufTagGetNRelFileInfo(slot->buftag),
-			slot->buftag.forkNum,
-			slot->buftag.blockNum
-		);
+		XLogRecPtr	lsn = neon_get_request_lsn(
+											   &request.req.latest,
+											   BufTagGetNRelFileInfo(slot->buftag),
+											   slot->buftag.forkNum,
+											   slot->buftag.blockNum
+			);
+
 		/*
-		 * Note: effective_request_lsn is potentially higher than the requested
-		 * LSN, but still correct:
-		 * 
+		 * Note: effective_request_lsn is potentially higher than the
+		 * requested LSN, but still correct:
+		 *
 		 * We know there are no changes between the actual requested LSN and
 		 * the value of effective_request_lsn: If there were, the page would
-		 * have been in cache and evicted between those LSN values, which
-		 * then would have had to result in a larger request LSN for this page.
-		 * 
+		 * have been in cache and evicted between those LSN values, which then
+		 * would have had to result in a larger request LSN for this page.
+		 *
 		 * It is possible that a concurrent backend loads the page, modifies
 		 * it and then evicts it again, but the LSN of that eviction cannot be
 		 * smaller than the current WAL insert/redo pointer, which is already
@@ -702,7 +720,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
  * prefetch_register_buffer() - register and prefetch buffer
  *
  * Register that we may want the contents of BufferTag in the near future.
- * 
+ *
  * If force_latest and force_lsn are not NULL, those values are sent to the
  * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
  * to fill in these values manually.
@@ -714,14 +732,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 static uint64
 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
 {
-	uint64	ring_index;
+	uint64		ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
 	PrfHashEntry *entry;
 
 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
-  Retry:
+Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
 
 	if (entry != NULL)
@@ -741,7 +759,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 		 */
 		if (force_latest && force_lsn)
 		{
-			/* if we want the latest version, any effective_request_lsn < request lsn is OK */
+			/*
+			 * if we want the latest version, any effective_request_lsn <
+			 * request lsn is OK
+			 */
 			if (*force_latest)
 			{
 				if (*force_lsn > slot->effective_request_lsn)
@@ -752,7 +773,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 				}
 
 			}
-			/* if we don't want the latest version, only accept requests with the exact same LSN */
+
+			/*
+			 * if we don't want the latest version, only accept requests with
+			 * the exact same LSN
+			 */
 			else
 			{
 				if (*force_lsn != slot->effective_request_lsn)
@@ -799,7 +824,8 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	 */
 	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
 	{
-		uint64 cleanup_index = MyPState->ring_last;
+		uint64		cleanup_index = MyPState->ring_last;
+
 		slot = GetPrfSlot(cleanup_index);
 
 		Assert(slot->status != PRFS_UNUSED);
@@ -814,7 +840,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 		}
 		else
 		{
-			/* We have the slot for ring_last, so that must still be in progress */
+			/*
+			 * We have the slot for ring_last, so that must still be in
+			 * progress
+			 */
 			switch (slot->status)
 			{
 				case PRFS_REQUESTED:
@@ -833,8 +862,8 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	}
 
 	/*
-	 * The next buffer pointed to by `ring_unused` is now definitely empty,
-	 * so we can insert the new request to it.
+	 * The next buffer pointed to by `ring_unused` is now definitely empty, so
+	 * we can insert the new request to it.
 	 */
 	ring_index = MyPState->ring_unused;
 	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
@@ -860,7 +889,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	{
 		if (!page_server->flush())
 		{
-			/* Prefetch set is reset in case of error, so we should try to register our request once again */
+			/*
+			 * Prefetch set is reset in case of error, so we should try to
+			 * register our request once again
+			 */
 			goto Retry;
 		}
 		MyPState->ring_flush = MyPState->ring_unused;
@@ -872,8 +904,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 static NeonResponse *
 page_server_request(void const *req)
 {
-	NeonResponse* resp;
-	do {
+	NeonResponse *resp;
+
+	do
+	{
 		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
 		MyPState->ring_flush = MyPState->ring_unused;
 		consume_prefetch_responses();
@@ -885,7 +919,7 @@ page_server_request(void const *req)
 
 
 StringInfoData
-nm_pack_request(NeonRequest * msg)
+nm_pack_request(NeonRequest *msg)
 {
 	StringInfoData s;
 
@@ -1001,7 +1035,7 @@ nm_unpack_response(StringInfo s)
 				/* XXX:	should be varlena */
 				memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
 				pq_getmsgend(s);
-				
+
 				Assert(msg_resp->tag == T_NeonGetPageResponse);
 
 				resp = (NeonResponse *) msg_resp;
@@ -1057,7 +1091,7 @@ nm_unpack_response(StringInfo s)
 
 /* dump to json for debugging / error reporting purposes */
 char *
-nm_to_string(NeonMessage * msg)
+nm_to_string(NeonMessage *msg)
 {
 	StringInfoData s;
 
@@ -1186,7 +1220,7 @@ nm_to_string(NeonMessage * msg)
  * directly because it skips the logging if the LSN is new enough.
  */
 static XLogRecPtr
-log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno,
+log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 Page page, bool page_std)
 {
 	PGAlignedBlock copied_buffer;
@@ -1209,11 +1243,11 @@ PageIsEmptyHeapPage(char *buffer)
 }
 
 static void
-neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 #if PG_MAJORVERSION_NUM < 16
-				 char *buffer, bool force)
+							 char *buffer, bool force)
 #else
-				 const char *buffer, bool force) 
+							 const char *buffer, bool force)
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
@@ -1313,24 +1347,24 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 void
 neon_init(void)
 {
-	Size prfs_size;
+	Size		prfs_size;
 
 	if (MyPState != NULL)
 		return;
 
 	prfs_size = offsetof(PrefetchState, prf_buffer) + (
-		sizeof(PrefetchRequest) * readahead_buffer_size
-	);
+													   sizeof(PrefetchRequest) * readahead_buffer_size
+		);
 
 	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
-	
+
 	MyPState->n_unused = readahead_buffer_size;
 
 	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
 										 "NeonSMGR/prefetch",
 										 SLAB_DEFAULT_BLOCK_SIZE * 17,
 										 PS_GETPAGERESPONSE_SIZE);
-	MyPState->errctx = AllocSetContextCreate(TopMemoryContext, 
+	MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
 											 "NeonSMGR/errors",
 											 ALLOCSET_DEFAULT_SIZES);
 	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
@@ -1570,14 +1604,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	/*
 	 * Newly created relation is empty, remember that in the relsize cache.
 	 *
-	 * Note that in REDO, this is called to make sure the relation fork exists,
-	 * but it does not truncate the relation. So, we can only update the
-	 * relsize if it didn't exist before.
-	 * 
+	 * Note that in REDO, this is called to make sure the relation fork
+	 * exists, but it does not truncate the relation. So, we can only update
+	 * the relsize if it didn't exist before.
+	 *
 	 * Also, in redo, we must make sure to update the cached size of the
-	 * relation, as that is the primary source of truth for REDO's
-	 * file length considerations, and as file extension isn't (perfectly)
-	 * logged, we need to take care of that before we hit file size checks.
+	 * relation, as that is the primary source of truth for REDO's file length
+	 * considerations, and as file extension isn't (perfectly) logged, we need
+	 * to take care of that before we hit file size checks.
 	 *
 	 * FIXME: This is currently not just an optimization, but required for
 	 * correctness. Postgres can call smgrnblocks() on the newly-created
@@ -1653,7 +1687,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 #endif
 {
 	XLogRecPtr	lsn;
-	BlockNumber	n_blocks = 0;
+	BlockNumber n_blocks = 0;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -1694,9 +1728,10 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	}
 
 	/*
-	 * Usually Postgres doesn't extend relation on more than one page
-	 * (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData
-	 * call smgrextend for destination relation n using size of source relation
+	 * Usually Postgres doesn't extend a relation by more than one page
+	 * (leaving holes). But this rule is violated in PG-15, where
+	 * CreateAndCopyRelationData calls smgrextend for the destination
+	 * relation using the size of the source relation.
 	 */
 	n_blocks = neon_nblocks(reln, forkNum);
 	while (n_blocks < blkno)
@@ -1717,11 +1752,13 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	if (IS_LOCAL_REL(reln))
 		mdextend(reln, forkNum, blkno, buffer, skipFsync);
 #endif
+
 	/*
-	 * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
-	 * An smgr_write() call will come for the buffer later, after it has been initialized
-	 * with the real page contents, and it is eventually evicted from the buffer cache.
-	 * But we need a valid LSN to the relation metadata update now.
+	 * smgr_extend is often called with an all-zeroes page, so
+	 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
+	 * later, after it has been initialized with the real page contents, and
+	 * it is eventually evicted from the buffer cache. But we need a valid LSN
+	 * to the relation metadata update now.
 	 */
 	if (lsn == InvalidXLogRecPtr)
 	{
@@ -1780,9 +1817,9 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-					errmsg("cannot extend file \"%s\" beyond %u blocks",
-						   relpath(reln->smgr_rlocator, forkNum),
-						   InvalidBlockNumber)));
+				 errmsg("cannot extend file \"%s\" beyond %u blocks",
+						relpath(reln->smgr_rlocator, forkNum),
+						InvalidBlockNumber)));
 
 	/* Don't log any pages if we're not allowed to do so. */
 	if (!XLogInsertAllowed())
@@ -1869,7 +1906,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	switch (reln->smgr_relpersistence)
 	{
-		case 0: /* probably shouldn't happen, but ignore it */
+		case 0:					/* probably shouldn't happen, but ignore it */
 		case RELPERSISTENCE_PERMANENT:
 			break;
 
@@ -1884,9 +1921,10 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
 		return false;
 
-	tag = (BufferTag) {
+	tag = (BufferTag)
+	{
 		.forkNum = forknum,
-		.blockNum = blocknum
+			.blockNum = blocknum
 	};
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
 
@@ -1941,11 +1979,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
  * To avoid breaking tests in the runtime please keep function signature in sync.
  */
 #if PG_MAJORVERSION_NUM < 16
-void PGDLLEXPORT
+void		PGDLLEXPORT
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, char *buffer)
 #else
-void PGDLLEXPORT
+void		PGDLLEXPORT
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, void *buffer)
 #endif
@@ -1956,21 +1994,21 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	PrfHashEntry *entry;
 	PrefetchRequest *slot;
 
-	buftag = (BufferTag) {
+	buftag = (BufferTag)
+	{
 		.forkNum = forkNum,
-		.blockNum = blkno,
+			.blockNum = blkno,
 	};
 
 	CopyNRelFileInfoToBufTag(buftag, rinfo);
 
 	/*
 	 * The redo process does not lock pages that it needs to replay but are
-	 * not in the shared buffers, so a concurrent process may request the
-	 * page after redo has decided it won't redo that page and updated the
-	 * LwLSN for that page.
-	 * If we're in hot standby we need to take care that we don't return
-	 * until after REDO has finished replaying up to that LwLSN, as the page
-	 * should have been locked up to that point.
+	 * not in the shared buffers, so a concurrent process may request the page
+	 * after redo has decided it won't redo that page and updated the LwLSN
+	 * for that page. If we're in hot standby we need to take care that we
+	 * don't return until after REDO has finished replaying up to that LwLSN,
+	 * as the page should have been locked up to that point.
 	 *
 	 * See also the description on neon_redo_read_buffer_filter below.
 	 *
@@ -1978,7 +2016,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	 * concurrent failed read IOs. Those IOs should never have a request_lsn
 	 * that is as large as the WAL record we're currently replaying, if it
 	 * weren't for the behaviour of the LwLsn cache that uses the highest
-	 * value of the LwLsn cache when the entry is not found. 
+	 * value of the LwLsn cache when the entry is not found.
 	 */
 	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
 		XLogWaitForReplayOf(request_lsn);
@@ -1996,12 +2034,14 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			ring_index = slot->my_ring_index;
 			pgBufferUsage.prefetch.hits += 1;
 		}
-		else /* the current prefetch LSN is not large enough, so drop the prefetch */
+		else					/* the current prefetch LSN is not large
+								 * enough, so drop the prefetch */
 		{
 			/*
 			 * We can't drop cache for not-yet-received requested items. It is
-			 * unlikely this happens, but it can happen if prefetch distance is
-			 * large enough and a backend didn't consume all prefetch requests.
+			 * unlikely this happens, but it can happen if prefetch distance
+			 * is large enough and a backend didn't consume all prefetch
+			 * requests.
 			 */
 			if (slot->status == PRFS_REQUESTED)
 			{
@@ -2028,11 +2068,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		else
 		{
 			/*
-			 * Empty our reference to the prefetch buffer's hash entry.
-			 * When we wait for prefetches, the entry reference is invalidated by 
-			 * potential updates to the hash, and when we reconnect to the 
-			 * pageserver the prefetch we're waiting for may be dropped,
-			 * in which case we need to retry and take the branch above.
+			 * Empty our reference to the prefetch buffer's hash entry. When
+			 * we wait for prefetches, the entry reference is invalidated by
+			 * potential updates to the hash, and when we reconnect to the
+			 * pageserver the prefetch we're waiting for may be dropped, in
+			 * which case we need to retry and take the branch above.
 			 */
 			entry = NULL;
 		}
@@ -2080,11 +2120,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
  *	neon_read() -- Read the specified block from a relation.
  */
 void
-neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+			neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-		  char *buffer)
+					  char *buffer)
 #else
-		  void *buffer)
+					  void *buffer)
 #endif
 {
 	bool		latest;
@@ -2219,11 +2259,11 @@ hexdump_page(char *page)
  *		use mdextend().
  */
 void
-neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 #if PG_MAJORVERSION_NUM < 16
-		   char *buffer, bool skipFsync)
+					   char *buffer, bool skipFsync)
 #else
-		   const void *buffer, bool skipFsync)
+					   const void *buffer, bool skipFsync)
 #endif
 {
 	XLogRecPtr	lsn;
@@ -2727,6 +2767,7 @@ static void
 neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
 {
 	BlockNumber relsize;
+
 	/* Extend the relation if we know its size */
 	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
@@ -2739,11 +2780,11 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 	else
 	{
 		/*
-		 * Size was not cached. We populate the cache now, with the size of the
-		 * relation measured after this WAL record is applied.
+		 * Size was not cached. We populate the cache now, with the size of
+		 * the relation measured after this WAL record is applied.
 		 *
-		 * This length is later reused when we open the smgr to read the block,
-		 * which is fine and expected.
+		 * This length is later reused when we open the smgr to read the
+		 * block, which is fine and expected.
 		 */
 
 		NeonResponse *response;
@@ -2763,7 +2804,7 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		Assert(response->tag == T_NeonNblocksResponse);
 		nbresponse = (NeonNblocksResponse *) response;
 
-		relsize = Max(nbresponse->n_blocks, blkno+1);
+		relsize = Max(nbresponse->n_blocks, blkno + 1);
 
 		set_cached_relsize(rinfo, forknum, relsize);
 		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
@@ -2805,7 +2846,7 @@ get_fsm_physical_block(BlockNumber heapblk)
 
 /*
  * Return whether we can skip the redo for this block.
- * 
+ *
  * The conditions for skipping the IO are:
  *
  * - The block is not in the shared buffers, and
@@ -2844,7 +2885,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	XLogRecPtr	end_recptr = record->EndRecPtr;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-	BlockNumber	blkno;
+	BlockNumber blkno;
 	BufferTag	tag;
 	uint32		hash;
 	LWLock	   *partitionLock;
@@ -2863,8 +2904,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 
 	/*
 	 * Out of an abundance of caution, we always run redo on shared catalogs,
-	 * regardless of whether the block is stored in shared buffers.
-	 * See also this function's top comment.
+	 * regardless of whether the block is stored in shared buffers. See also
+	 * this function's top comment.
 	 */
 	if (!OidIsValid(NInfoGetDbOid(rinfo)))
 		return false;
@@ -2890,8 +2931,9 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	/* In both cases st lwlsn past this WAL record */
 	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 
-	/* we don't have the buffer in memory, update lwLsn past this record,
-	 * also evict page fro file cache
+	/*
+	 * we don't have the buffer in memory, update lwLsn past this record, also
+	 * evict page from file cache
 	 */
 	if (no_redo_needed)
 		lfc_evict(rinfo, forknum, blkno);
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 10544ba7a8..7d9dbfdb7f 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -178,7 +178,7 @@ WalProposerFree(WalProposer *wp)
 	if (wp->propTermHistory.entries != NULL)
 		pfree(wp->propTermHistory.entries);
 	wp->propTermHistory.entries = NULL;
-	
+
 	pfree(wp);
 }
 
@@ -275,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
 											   wp->config->safekeeper_connection_timeout))
 				{
 					walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
-						 sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
+								sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -395,7 +395,7 @@ ResetConnection(Safekeeper *sk)
 		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
 		 */
 		walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
-			 sk->host, sk->port, wp->api.conn_error_message(sk));
+					sk->host, sk->port, wp->api.conn_error_message(sk));
 
 		/*
 		 * Even though the connection failed, we still need to clean up the
@@ -489,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_OFFLINE:
 			walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
-				 sk->host, sk->port);
+						sk->host, sk->port);
 			break;				/* actually unreachable, but prevents
 								 * -Wimplicit-fallthrough */
 
@@ -525,7 +525,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_VOTING:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-				 sk->port, FormatSafekeeperState(sk->state));
+						sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;
 
@@ -554,7 +554,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_IDLE:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-				 sk->port, FormatSafekeeperState(sk->state));
+						sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;
 
@@ -580,7 +580,7 @@ HandleConnectionEvent(Safekeeper *sk)
 	{
 		case WP_CONN_POLLING_OK:
 			walprop_log(LOG, "connected with node %s:%s", sk->host,
-				 sk->port);
+						sk->port);
 			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
 
 			/*
@@ -604,7 +604,7 @@ HandleConnectionEvent(Safekeeper *sk)
 
 		case WP_CONN_POLLING_FAILED:
 			walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
-				 sk->host, sk->port, wp->api.conn_error_message(sk));
+						sk->host, sk->port, wp->api.conn_error_message(sk));
 
 			/*
 			 * If connecting failed, we don't want to restart the connection
@@ -641,7 +641,7 @@ SendStartWALPush(Safekeeper *sk)
 	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
 	{
 		walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
-			 sk->host, sk->port, wp->api.conn_error_message(sk));
+					sk->host, sk->port, wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return;
 	}
@@ -678,7 +678,7 @@ RecvStartWALPushResult(Safekeeper *sk)
 
 		case WP_EXEC_FAILED:
 			walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
-				 sk->host, sk->port, wp->api.conn_error_message(sk));
+						sk->host, sk->port, wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return;
 
@@ -689,7 +689,7 @@ RecvStartWALPushResult(Safekeeper *sk)
 			 */
 		case WP_EXEC_UNEXPECTED_SUCCESS:
 			walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
-				 sk->host, sk->port);
+						sk->host, sk->port);
 			ShutdownConnection(sk);
 			return;
 	}
@@ -758,8 +758,8 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	{
 		/* Another compute with higher term is running. */
 		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-			 sk->host, sk->port,
-			 sk->greetResponse.term, wp->propTerm);
+					sk->host, sk->port,
+					sk->greetResponse.term, wp->propTerm);
 	}
 
 	/*
@@ -817,11 +817,11 @@ RecvVoteResponse(Safekeeper *sk)
 		return;
 
 	walprop_log(LOG,
-		 "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
-		 sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
-		 LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
-		 LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
-		 LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
+				"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
+				sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
+				LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
+				LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
+				LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
 
 	/*
 	 * In case of acceptor rejecting our vote, bail out, but only if either it
@@ -832,8 +832,8 @@ RecvVoteResponse(Safekeeper *sk)
 		(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
 	{
 		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-			 sk->host, sk->port,
-			 sk->voteResponse.term, wp->propTerm);
+					sk->host, sk->port,
+					sk->voteResponse.term, wp->propTerm);
 	}
 	Assert(sk->voteResponse.term == wp->propTerm);
 
@@ -877,10 +877,10 @@ HandleElectedProposer(WalProposer *wp)
 	if (wp->truncateLsn < wp->propEpochStartLsn)
 	{
 		walprop_log(LOG,
-			 "start recovery because truncateLsn=%X/%X is not "
-			 "equal to epochStartLsn=%X/%X",
-			 LSN_FORMAT_ARGS(wp->truncateLsn),
-			 LSN_FORMAT_ARGS(wp->propEpochStartLsn));
+					"start recovery because truncateLsn=%X/%X is not "
+					"equal to epochStartLsn=%X/%X",
+					LSN_FORMAT_ARGS(wp->truncateLsn),
+					LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 		/* Perform recovery */
 		if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
 			walprop_log(FATAL, "Failed to recover state");
@@ -990,9 +990,9 @@ DetermineEpochStartLsn(WalProposer *wp)
 					wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
 				{
 					walprop_log(WARNING,
-						 "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
-						 LSN_FORMAT_ARGS(wp->timelineStartLsn),
-						 LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
+								"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
+								LSN_FORMAT_ARGS(wp->timelineStartLsn),
+								LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
 				}
 				wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
 			}
@@ -1038,11 +1038,11 @@ DetermineEpochStartLsn(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;
 
 	walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		 wp->quorum,
-		 wp->propTerm,
-		 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-		 wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
-		 LSN_FORMAT_ARGS(wp->truncateLsn));
+				wp->quorum,
+				wp->propTerm,
+				LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+				wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+				LSN_FORMAT_ARGS(wp->truncateLsn));
 
 	/*
 	 * Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1070,18 +1070,18 @@ DetermineEpochStartLsn(WalProposer *wp)
 											walprop_shared->mineLastElectedTerm)))
 			{
 				walprop_log(PANIC,
-					 "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
-					 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-					 LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
+							"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
+							LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+							LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
 			}
 		}
 		walprop_shared->mineLastElectedTerm = wp->propTerm;
 	}
 
 	/*
-	 * WalProposer has just elected itself and initialized history, so
-	 * we can call election callback. Usually it updates truncateLsn to
-	 * fetch WAL for logical replication.
+	 * WalProposer has just elected itself and initialized history, so we can
+	 * call election callback. Usually it updates truncateLsn to fetch WAL for
+	 * logical replication.
 	 */
 	wp->api.after_election(wp);
 }
@@ -1155,8 +1155,8 @@ SendProposerElected(Safekeeper *sk)
 			sk->startStreamingAt = wp->truncateLsn;
 
 			walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
-				 sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
-				 LSN_FORMAT_ARGS(sk->startStreamingAt));
+						sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
+						LSN_FORMAT_ARGS(sk->startStreamingAt));
 		}
 	}
 	else
@@ -1190,8 +1190,8 @@ SendProposerElected(Safekeeper *sk)
 
 	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
 	walprop_log(LOG,
-		 "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
-		 sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
+				"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
+				sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
 
 	resetStringInfo(&sk->outbuf);
 	pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1355,11 +1355,11 @@ SendAppendRequests(Safekeeper *sk)
 		PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
 
 		walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
-						req->endLsn - req->beginLsn,
-						LSN_FORMAT_ARGS(req->beginLsn),
-						LSN_FORMAT_ARGS(req->endLsn),
-						LSN_FORMAT_ARGS(req->commitLsn),
-						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
+					req->endLsn - req->beginLsn,
+					LSN_FORMAT_ARGS(req->beginLsn),
+					LSN_FORMAT_ARGS(req->endLsn),
+					LSN_FORMAT_ARGS(req->commitLsn),
+					LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
 
 		resetStringInfo(&sk->outbuf);
 
@@ -1398,8 +1398,8 @@ SendAppendRequests(Safekeeper *sk)
 
 			case PG_ASYNC_WRITE_FAIL:
 				walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-					 sk->host, sk->port, FormatSafekeeperState(sk->state),
-					 wp->api.conn_error_message(sk));
+							sk->host, sk->port, FormatSafekeeperState(sk->state),
+							wp->api.conn_error_message(sk));
 				ShutdownConnection(sk);
 				return false;
 			default:
@@ -1438,17 +1438,17 @@ RecvAppendResponses(Safekeeper *sk)
 			break;
 
 		walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
-						sk->appendResponse.term,
-						LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
-						LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
-						sk->host, sk->port);
+					sk->appendResponse.term,
+					LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
+					LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
+					sk->host, sk->port);
 
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/* Another compute with higher term is running. */
 			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
-				 sk->host, sk->port,
-				 sk->appendResponse.term, wp->propTerm);
+						sk->host, sk->port,
+						sk->appendResponse.term, wp->propTerm);
 		}
 
 		readAnything = true;
@@ -1493,7 +1493,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
-				 rf->currentClusterSize);
+						rf->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
@@ -1501,7 +1501,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->last_received_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->last_received_lsn));
+						LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
@@ -1509,7 +1509,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
+						LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
@@ -1517,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
+						LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
@@ -1530,7 +1530,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 				/* Copy because timestamptz_to_str returns a static buffer */
 				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
 				walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
-					 rf->replytime, replyTimeStr);
+							rf->replytime, replyTimeStr);
 
 				pfree(replyTimeStr);
 			}
@@ -1700,8 +1700,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
 
 		case PG_ASYNC_READ_FAIL:
 			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
-				 sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk));
+						sk->port, FormatSafekeeperState(sk->state),
+						wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 	}
@@ -1740,7 +1740,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 	if (tag != anymsg->tag)
 	{
 		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
-			 sk->port, FormatSafekeeperState(sk->state));
+					sk->port, FormatSafekeeperState(sk->state));
 		ResetConnection(sk);
 		return false;
 	}
@@ -1816,8 +1816,8 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
 	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 	{
 		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-			 sk->host, sk->port, FormatSafekeeperState(sk->state),
-			 wp->api.conn_error_message(sk));
+					sk->host, sk->port, FormatSafekeeperState(sk->state),
+					wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return false;
 	}
@@ -1863,8 +1863,8 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 			return false;
 		case PG_ASYNC_WRITE_FAIL:
 			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-				 sk->host, sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk));
+						sk->host, sk->port, FormatSafekeeperState(sk->state),
+						wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 		default:
@@ -1902,8 +1902,8 @@ AsyncFlush(Safekeeper *sk)
 			return false;
 		case -1:
 			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
-				 sk->host, sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk));
+						sk->host, sk->port, FormatSafekeeperState(sk->state),
+						wp->api.conn_error_message(sk));
 			ResetConnection(sk);
 			return false;
 		default:
@@ -2008,7 +2008,7 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
 		 * and then an assertion that's guaranteed to fail.
 		 */
 		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-			 FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
+					FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
 		Assert(events_ok_for_state);
 	}
 }
@@ -2111,7 +2111,7 @@ FormatEvents(WalProposer *wp, uint32 events)
 	if (events & (~all_flags))
 	{
 		walprop_log(WARNING, "Event formatting found unexpected component %d",
-			 events & (~all_flags));
+					events & (~all_flags));
 		return_str[6] = '*';
 		return_str[7] = '\0';
 	}
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 664aeedfa7..ae7812e710 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -356,7 +356,8 @@ typedef struct Safekeeper
 
 
 	/* postgres-specific fields */
-	#ifndef WALPROPOSER_LIB
+#ifndef WALPROPOSER_LIB
+
 	/*
 	 * postgres protocol connection to the WAL acceptor
 	 *
@@ -374,17 +375,18 @@ typedef struct Safekeeper
 	 * Position in wait event set. Equal to -1 if no event
 	 */
 	int			eventPos;
-	#endif
+#endif
 
 
 	/* WalProposer library specifics */
-	#ifdef WALPROPOSER_LIB
+#ifdef WALPROPOSER_LIB
+
 	/*
 	 * Buffer for incoming messages. Usually Rust vector is stored here.
 	 * Caller is responsible for freeing the buffer.
 	 */
 	StringInfoData inbuf;
-	#endif
+#endif
 } Safekeeper;
 
 /* Re-exported PostgresPollingStatusType */
@@ -472,7 +474,7 @@ typedef struct walproposer_api
 	WalProposerConnStatusType (*conn_status) (Safekeeper *sk);
 
 	/* Start the connection, aka PQconnectStart. */
-	void (*conn_connect_start) (Safekeeper *sk);
+	void		(*conn_connect_start) (Safekeeper *sk);
 
 	/* Poll an asynchronous connection, aka PQconnectPoll. */
 	WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);
@@ -490,7 +492,7 @@ typedef struct walproposer_api
 	void		(*conn_finish) (Safekeeper *sk);
 
 	/*
-	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData. 
+	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData.
 	 *
 	 * On success, the data is placed in *buf. It is valid until the next call
 	 * to this function.
@@ -510,7 +512,7 @@ typedef struct walproposer_api
 	void		(*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
 
 	/* Allocate WAL reader. */
-	void (*wal_reader_allocate) (Safekeeper *sk);
+	void		(*wal_reader_allocate) (Safekeeper *sk);
 
 	/* Deallocate event set. */
 	void		(*free_event_set) (WalProposer *wp);
@@ -572,7 +574,7 @@ typedef struct walproposer_api
 	/*
 	 * Called right after the proposer was elected, but before it started
 	 * recovery and sent ProposerElected message to the safekeepers.
-	 * 
+	 *
 	 * Used by logical replication to update truncateLsn.
 	 */
 	void		(*after_election) (WalProposer *wp);
@@ -626,10 +628,10 @@ typedef struct WalProposerConfig
 	uint64		systemId;
 
 	/* Will be passed to safekeepers in greet request. */
-	TimeLineID  pgTimeline;
+	TimeLineID	pgTimeline;
 
 #ifdef WALPROPOSER_LIB
-	void *callback_data;
+	void	   *callback_data;
 #endif
 } WalProposerConfig;
 
@@ -710,10 +712,11 @@ extern void WalProposerPoll(WalProposer *wp);
 extern void WalProposerFree(WalProposer *wp);
 
 
-#define WPEVENT		1337	/* special log level for walproposer internal events */
+#define WPEVENT		1337		/* special log level for walproposer internal
+								 * events */
 
 #ifdef WALPROPOSER_LIB
-void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...);
+void		WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
 #define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
 #else
 #define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c
index 7617f21a26..04b519ab15 100644
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -9,8 +9,9 @@
 #include "utils/datetime.h"
 #include "miscadmin.h"
 
-void ExceptionalCondition(const char *conditionName,
-						  const char *fileName, int lineNumber)
+void
+ExceptionalCondition(const char *conditionName,
+					 const char *fileName, int lineNumber)
 {
 	fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
 			fileName, lineNumber, conditionName);
@@ -169,17 +170,18 @@ timestamptz_to_str(TimestampTz t)
 
 bool
 TimestampDifferenceExceeds(TimestampTz start_time,
-								TimestampTz stop_time,
-								int msec)
+						   TimestampTz stop_time,
+						   int msec)
 {
 	TimestampTz diff = stop_time - start_time;
+
 	return (diff >= msec * INT64CONST(1000));
 }
 
 void
-WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...)
+WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...)
 {
-	char buf[1024];
+	char		buf[1024];
 	va_list		args;
 
 	fmt = _(fmt);
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index f83a08d407..551d56d416 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -637,8 +637,8 @@ walprop_connect_start(Safekeeper *sk)
 	 */
 	sk->conn = palloc(sizeof(WalProposerConn));
 	sk->conn->pg_conn = pg_conn;
-	sk->conn->is_nonblocking = false;	/* connections always start in blocking
-									 * mode */
+	sk->conn->is_nonblocking = false;	/* connections always start in
+										 * blocking mode */
 	sk->conn->recvbuf = NULL;
 }
 
@@ -1291,10 +1291,11 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
 	/*
 	 * Apart from walproposer, basebackup LSN page is also written out by
 	 * postgres itself which writes WAL only in pages, and in basebackup it is
-	 * inherently dummy (only safekeepers have historic WAL). Update WAL buffers
-	 * here to avoid dummy page overwriting correct one we download here. Ugly,
-	 * but alternatives are about the same ugly. We won't need that if we switch
-	 * to on-demand WAL download from safekeepers, without writing to disk.
+	 * inherently dummy (only safekeepers have historic WAL). Update WAL
+	 * buffers here to avoid dummy page overwriting correct one we download
+	 * here. Ugly, but alternatives are about the same ugly. We won't need
+	 * that if we switch to on-demand WAL download from safekeepers, without
+	 * writing to disk.
 	 *
 	 * https://github.com/neondatabase/neon/issues/5749
 	 */
@@ -1681,17 +1682,17 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
 static void
 walprop_pg_after_election(WalProposer *wp)
 {
-	FILE* f;
-	XLogRecPtr lrRestartLsn;
+	FILE	   *f;
+	XLogRecPtr	lrRestartLsn;
 
-	/* We don't need to do anything in syncSafekeepers mode.*/
+	/* We don't need to do anything in syncSafekeepers mode. */
 	if (wp->config->syncSafekeepers)
 		return;
 
 	/*
-	 * If there are active logical replication subscription we need
-	 * to provide enough WAL for their WAL senders based on th position
-	 * of their replication slots.
+	 * If there are active logical replication subscription we need to provide
+	 * enough WAL for their WAL senders based on the position of their
+	 * replication slots.
 	 */
 	f = fopen("restart.lsn", "rb");
 	if (f != NULL && !wp->config->syncSafekeepers)
@@ -1700,8 +1701,12 @@ walprop_pg_after_election(WalProposer *wp)
 		fclose(f);
 		if (lrRestartLsn != InvalidXLogRecPtr)
 		{
-			elog(LOG, "Logical replication restart LSN %X/%X",  LSN_FORMAT_ARGS(lrRestartLsn));
-			/* start from the beginning of the segment to fetch page headers verifed by XLogReader */
+			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
+
+			/*
+			 * start from the beginning of the segment to fetch page headers
+			 * verified by XLogReader
+			 */
 			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
 			wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
 		}

From 193e60e2b867e2d08b919ee1e551739cc6a20fd2 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 7 Dec 2023 17:46:29 +0300
Subject: [PATCH 054/202] Fix/edit pgindent confusing places in neon.

---
 pgxn/neon/extension_server.c     | 19 +++++++-------
 pgxn/neon/file_cache.c           | 27 ++++++++++++--------
 pgxn/neon/neon.c                 |  8 +++---
 pgxn/neon/neon_pgversioncompat.h |  2 +-
 pgxn/neon/pagestore_smgr.c       | 44 ++++++++++++--------------------
 pgxn/neon/walproposer.h          |  2 +-
 6 files changed, 50 insertions(+), 52 deletions(-)

diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c
index 597eed8db5..fbbb8fd448 100644
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -29,15 +29,16 @@ static int	extension_server_port = 0;
 
 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
 
-/*  to download all SQL (and data) files for an extension: */
-/*  curl -X POST http://localhost:8080/extension_server/postgis */
-/*  it covers two possible extension files layouts: */
-/*  1. extension_name--version--platform.sql */
-/*  2. extension_name/extension_name--version.sql */
-/*     extension_name/extra_files.csv */
-/*  */
-/*  to download specific library file: */
-/*  curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true */
+/*
+  * to download all SQL (and data) files for an extension:
+  * curl -X POST http://localhost:8080/extension_server/postgis
+  * it covers two possible extension files layouts:
+  * 1. extension_name--version--platform.sql
+  * 2. extension_name/extension_name--version.sql
+  *    extension_name/extra_files.csv
+  * to download specific library file:
+  * curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
+  */
 static bool
 neon_download_extension_file_http(const char *filename, bool is_library)
 {
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 991b553b10..54b3661e66 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -219,12 +219,12 @@ lfc_shmem_startup(void)
 		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);
-		lfc_hash = ShmemInitHash("lfc_hash",
 
 		/*
 		 * lfc_size+1 because we add new element to hash table before eviction
 		 * of victim
 		 */
+		lfc_hash = ShmemInitHash("lfc_hash",
 								 lfc_size + 1, lfc_size + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
@@ -576,11 +576,10 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
  * If cache is full then evict some other page.
  */
 void
-			lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-					  char *buffer)
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
 #else
-					  const void *buffer)
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
 #endif
 {
 	BufferTag	tag;
@@ -626,13 +625,19 @@ void
 	{
 		/*
 		 * We have two choices if all cache pages are pinned (i.e. used in IO
-		 * operations): 1. Wait until some of this operation is completed and
-		 * pages is unpinned 2. Allocate one more chunk, so that specified
-		 * cache size is more recommendation than hard limit. As far as
-		 * probability of such event (that all pages are pinned) is considered
-		 * to be very very small: there are should be very large number of
-		 * concurrent IO operations and them are limited by max_connections,
-		 * we prefer not to complicate code and use second approach.
+		 * operations):
+		 *
+		 * 1) Wait until some of these operations complete and their pages
+		 * are unpinned.
+		 *
+		 * 2) Allocate one more chunk, so that the specified cache size is
+		 * more of a recommendation than a hard limit.
+		 *
+		 * The probability of such an event (all pages being pinned) is
+		 * considered to be very small: it would require a very large number
+		 * of concurrent IO operations, and those are limited by
+		 * max_connections. So we prefer not to complicate the code and use
+		 * the second approach.
 		 */
 		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 6a8f8cca70..b930fdb3ca 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -48,9 +48,11 @@ _PG_init(void)
 
 	pg_init_extension_server();
 
-	/* Important: This must happen after other parts of the extension */
-	/* are loaded, otherwise any settings to GUCs that were set before */
-	/* the extension was loaded will be removed. */
+	/*
+	 * Important: This must happen after other parts of the extension are
+	 * loaded, otherwise any settings to GUCs that were set before the
+	 * extension was loaded will be removed.
+	 */
 	EmitWarningsOnPlaceholders("neon");
 }
 
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index d38ef48910..f19732cbbb 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -109,4 +109,4 @@
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif
 
-#endif	/* //NEON_PGVERSIONCOMPAT_H */
+#endif							/* NEON_PGVERSIONCOMPAT_H */
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index c9c41cd1cc..609d80588c 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -370,9 +370,8 @@ readahead_buffer_resize(int newsize, void *extra)
 	uint64		end,
 				nfree = newsize;
 	PrefetchState *newPState;
-	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) + (
-																	  sizeof(PrefetchRequest) * newsize
-		);
+	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
+		(sizeof(PrefetchRequest) * newsize);
 
 	/* don't try to re-initialize if we haven't initialized yet */
 	if (MyPState == NULL)
@@ -1243,11 +1242,10 @@ PageIsEmptyHeapPage(char *buffer)
 }
 
 static void
-			neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 #if PG_MAJORVERSION_NUM < 16
-							 char *buffer, bool force)
+neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
 #else
-							 const char *buffer, bool force)
+neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
@@ -1352,9 +1350,8 @@ neon_init(void)
 	if (MyPState != NULL)
 		return;
 
-	prfs_size = offsetof(PrefetchState, prf_buffer) + (
-													   sizeof(PrefetchRequest) * readahead_buffer_size
-		);
+	prfs_size = offsetof(PrefetchState, prf_buffer) +
+		sizeof(PrefetchRequest) * readahead_buffer_size;
 
 	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
 
@@ -1901,8 +1898,8 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
 bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-	BufferTag	tag;
 	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
+	BufferTag	tag;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -1921,11 +1918,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
 		return false;
 
-	tag = (BufferTag)
-	{
-		.forkNum = forknum,
-			.blockNum = blocknum
-	};
+	tag.forkNum = forknum;
+	tag.blockNum = blocknum;
+
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
 
 	ring_index = prefetch_register_buffer(tag, NULL, NULL);
@@ -1978,26 +1973,23 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
  * While function is defined in the neon extension it's used within neon_test_utils directly.
  * To avoid breaking tests in the runtime please keep function signature in sync.
  */
+void
 #if PG_MAJORVERSION_NUM < 16
-void		PGDLLEXPORT
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, char *buffer)
 #else
-void		PGDLLEXPORT
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, void *buffer)
 #endif
 {
 	NeonResponse *resp;
-	BufferTag	buftag;
 	uint64		ring_index;
 	PrfHashEntry *entry;
 	PrefetchRequest *slot;
-
-	buftag = (BufferTag)
+	BufferTag	buftag =
 	{
 		.forkNum = forkNum,
-			.blockNum = blkno,
+		.blockNum = blkno,
 	};
 
 	CopyNRelFileInfoToBufTag(buftag, rinfo);
@@ -2120,11 +2112,10 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
  *	neon_read() -- Read the specified block from a relation.
  */
 void
-			neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-					  char *buffer)
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
 #else
-					  void *buffer)
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
 #endif
 {
 	bool		latest;
@@ -2259,11 +2250,10 @@ hexdump_page(char *page)
  *		use mdextend().
  */
 void
-			neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 #if PG_MAJORVERSION_NUM < 16
-					   char *buffer, bool skipFsync)
+neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
 #else
-					   const void *buffer, bool skipFsync)
+neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
 #endif
 {
 	XLogRecPtr	lsn;
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index ae7812e710..615018c58e 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -716,7 +716,7 @@ extern void WalProposerFree(WalProposer *wp);
 								 * events */
 
 #ifdef WALPROPOSER_LIB
-void		WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
+extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
 #define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
 #else
 #define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)

From 2c544343e0e16eca4704f3477a470f0646a3481f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 8 Dec 2023 10:12:37 +0000
Subject: [PATCH 055/202] pageserver: filtered WAL ingest for sharding (#6024)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

Currently, if one creates many shards they will all ingest all the data:
not much use! We want them to ingest a proportional share of the data
each.

Closes: #6025

## Summary of changes

- WalIngest object gets a copy of the ShardIdentity for the Tenant it
was created by.
- While iterating the `blocks` part of a decoded record, blocks that do
not match the current shard are ignored, except on shard zero, where
they are used to update relation sizes in `observe_decoded_block` (but
are not stored).
- Before committing a `DatadirModification` from a WAL record, we check
if it's empty, and drop the record if so. This check is necessary
(rather than just looking at the `blocks` part) because certain record
types may modify blocks in non-obvious ways (e.g.
`ingest_heapam_record`).
- Add WAL ingest metrics to record the total number of records received,
committed, and filtered out.
- Behaviour for unsharded tenants is unchanged: they will continue to
ingest all blocks, and will take the fast path through `is_key_local`
that doesn't bother calculating any hashes.

After this change, shards store a subset of the tenant's total data, and
accurate relation sizes are only maintained on shard zero.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 libs/pageserver_api/src/shard.rs    |  6 +++
 pageserver/src/metrics.rs           | 24 ++++++++++
 pageserver/src/pgdatadir_mapping.rs |  4 ++
 pageserver/src/tenant/timeline.rs   |  2 +-
 pageserver/src/walingest.rs         | 69 ++++++++++++++++++++++++++++-
 5 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index f8f9449d86..9e83e0eee2 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -420,6 +420,12 @@ impl ShardIdentity {
             String::new()
         }
     }
+
+    /// Convenience for checking if this identity is the 0th shard in a tenant,
+    /// for special cases on shard 0 such as ingesting relation sizes.
+    pub fn is_zero(&self) -> bool {
+        self.number == ShardNumber(0)
+    }
 }
 
 impl Serialize for ShardIndex {
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 3554a93ed9..80113220ae 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1168,6 +1168,30 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 }
 });
 
+pub(crate) struct WalIngestMetrics {
+    pub(crate) records_received: IntCounter,
+    pub(crate) records_committed: IntCounter,
+    pub(crate) records_filtered: IntCounter,
+}
+
+pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    records_received: register_int_counter!(
+        "pageserver_wal_ingest_records_received",
+        "Number of WAL records received from safekeepers"
+    )
+    .expect("failed to define a metric"),
+    records_committed: register_int_counter!(
+        "pageserver_wal_ingest_records_committed",
+        "Number of WAL records which resulted in writes to pageserver storage"
+    )
+    .expect("failed to define a metric"),
+    records_filtered: register_int_counter!(
+        "pageserver_wal_ingest_records_filtered",
+        "Number of WAL records filtered out due to sharding"
+    )
+    .expect("failed to define a metric"),
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index d37d953696..c653f0b7ea 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1368,6 +1368,10 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub(crate) fn is_empty(&self) -> bool {
+        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
+    }
+
     // Internal helper functions to batch the modifications
 
     async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 882a5ef199..551b66b77d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2524,7 +2524,7 @@ impl Timeline {
         Ok(())
     }
 
-    fn finish_write(&self, new_lsn: Lsn) {
+    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
         assert!(new_lsn.is_aligned());
 
         self.metrics.last_record_gauge.set(new_lsn.0 as i64);
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index cbb08f7ff1..75b29a2fed 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,6 +21,7 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
+use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -30,6 +31,7 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
 
 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
@@ -46,6 +48,7 @@ use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;
 
 pub struct WalIngest<'a> {
+    shard: ShardIdentity,
     timeline: &'a Timeline,
 
     checkpoint: CheckPoint,
@@ -65,6 +68,7 @@ impl<'a> WalIngest<'a> {
         trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
 
         Ok(WalIngest {
+            shard: *timeline.get_shard_identity(),
             timeline,
             checkpoint,
             checkpoint_modified: false,
@@ -87,6 +91,8 @@ impl<'a> WalIngest<'a> {
         decoded: &mut DecodedWALRecord,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
+        WAL_INGEST.records_received.inc();
+
         modification.lsn = lsn;
         decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
 
@@ -355,6 +361,33 @@ impl<'a> WalIngest<'a> {
         // Iterate through all the blocks that the record modifies, and
         // "put" a separate copy of the record for each block.
         for blk in decoded.blocks.iter() {
+            let rel = RelTag {
+                spcnode: blk.rnode_spcnode,
+                dbnode: blk.rnode_dbnode,
+                relnode: blk.rnode_relnode,
+                forknum: blk.forknum,
+            };
+
+            let key = rel_block_to_key(rel, blk.blkno);
+            let key_is_local = self.shard.is_key_local(&key);
+
+            tracing::debug!(
+                lsn=%lsn,
+                key=%key,
+                "ingest: shard decision {} (checkpoint={})",
+                if !key_is_local { "drop" } else { "keep" },
+                self.checkpoint_modified
+            );
+
+            if !key_is_local {
+                if self.shard.is_zero() {
+                    // Shard 0 tracks relation sizes.  Although we will not store this block, we will observe
+                    // its blkno in case it implicitly extends a relation.
+                    self.observe_decoded_block(modification, blk, ctx).await?;
+                }
+
+                continue;
+            }
             self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
                 .await?;
         }
@@ -367,13 +400,38 @@ impl<'a> WalIngest<'a> {
             self.checkpoint_modified = false;
         }
 
+        if modification.is_empty() {
+            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
+            WAL_INGEST.records_filtered.inc();
+            modification.tline.finish_write(lsn);
+        } else {
+            WAL_INGEST.records_committed.inc();
+            modification.commit(ctx).await?;
+        }
+
         // Now that this record has been fully handled, including updating the
-        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit(ctx).await?;
+        // checkpoint data, let the repository know that it is up-to-date to this LSN.
 
         Ok(())
     }
 
+    /// Do not store this block, but observe it for the purposes of updating our relation size state.
+    async fn observe_decoded_block(
+        &mut self,
+        modification: &mut DatadirModification<'_>,
+        blk: &DecodedBkpBlock,
+        ctx: &RequestContext,
+    ) -> Result<(), PageReconstructError> {
+        let rel = RelTag {
+            spcnode: blk.rnode_spcnode,
+            dbnode: blk.rnode_dbnode,
+            relnode: blk.rnode_relnode,
+            forknum: blk.forknum,
+        };
+        self.handle_rel_extend(modification, rel, blk.blkno, ctx)
+            .await
+    }
+
     async fn ingest_decoded_block(
         &mut self,
         modification: &mut DatadirModification<'_>,
@@ -1465,8 +1523,15 @@ impl<'a> WalIngest<'a> {
             //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
             modification.put_rel_extend(rel, new_nblocks, ctx).await?;
 
+            let mut key = rel_block_to_key(rel, blknum);
             // fill the gap with zeros
             for gap_blknum in old_nblocks..blknum {
+                key.field6 = gap_blknum;
+
+                if self.shard.get_shard_number(&key) != self.shard.number {
+                    continue;
+                }
+
                 modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
             }
         }

From 699049b8f3ef11547d2f10f2b544720bba45236c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 8 Dec 2023 11:48:37 +0000
Subject: [PATCH 056/202] proxy: make auth more type safe (#5689)

## Problem

https://github.com/neondatabase/neon/blob/a5292f7e67727307341c1d4a372ee7f69efbb626/proxy/src/auth/backend.rs#L146-L148

https://github.com/neondatabase/neon/blob/a5292f7e67727307341c1d4a372ee7f69efbb626/proxy/src/console/provider/neon.rs#L90

https://github.com/neondatabase/neon/blob/a5292f7e67727307341c1d4a372ee7f69efbb626/proxy/src/console/provider/neon.rs#L154

## Summary of changes

1. Test backend is only enabled on `cfg(test)`.
2. Postgres mock backend + MD5 auth keys are only enabled on
`cfg(feature = "testing")`.
3. Password hack and cleartext flow will have their passwords validated
before proceeding.
4. Distinguish between ClientCredentials with endpoint and without,
removing many panics in the process.
---
 proxy/Cargo.toml                      |   6 +-
 proxy/src/auth/backend.rs             | 203 +++++++++++++---------
 proxy/src/auth/backend/classic.rs     |  17 +-
 proxy/src/auth/backend/hacks.rs       |  55 +++---
 proxy/src/auth/backend/link.rs        |  14 +-
 proxy/src/auth/credentials.rs         |  67 ++++----
 proxy/src/auth/flow.rs                |  60 ++++++-
 proxy/src/auth/password_hack.rs       |   5 +-
 proxy/src/bin/proxy.rs                |   2 +
 proxy/src/console/provider.rs         |  10 +-
 proxy/src/console/provider/mock.rs    |  16 +-
 proxy/src/console/provider/neon.rs    |  29 ++--
 proxy/src/proxy.rs                    |  37 ++---
 proxy/src/proxy/tests.rs              |   9 +-
 proxy/src/scram.rs                    |   2 +-
 proxy/src/scram/exchange.rs           | 231 +++++++++++++++++---------
 proxy/src/serverless.rs               |  12 +-
 proxy/src/serverless/conn_pool.rs     |  15 +-
 proxy/src/serverless/sql_over_http.rs |   6 +-
 proxy/src/serverless/websocket.rs     |   4 +-
 20 files changed, 497 insertions(+), 303 deletions(-)

diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 48c8604d86..438190261d 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+default = []
+testing = []
+
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
@@ -69,6 +73,7 @@ webpki-roots.workspace = true
 x509-parser.workspace = true
 native-tls.workspace = true
 postgres-native-tls.workspace = true
+postgres-protocol.workspace = true
 smol_str.workspace = true
 
 workspace_hack.workspace = true
@@ -78,4 +83,3 @@ tokio-util.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
-postgres-protocol.workspace = true
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index aa872285b1..649b3f40f2 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,9 +3,11 @@ mod hacks;
 mod link;
 
 pub use link::LinkAuthError;
+use smol_str::SmolStr;
 use tokio_postgres::config::AuthKeys;
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
+use crate::auth::validate_password_and_exchange;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::AuthInfo;
 use crate::console::AuthSecret;
@@ -24,31 +26,12 @@ use crate::{
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
+use std::net::IpAddr;
 use std::ops::ControlFlow;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, warn};
 
-/// A product of successful authentication.
-pub struct AuthSuccess<T> {
-    /// Did we send [`pq_proto::BeMessage::AuthenticationOk`] to client?
-    pub reported_auth_ok: bool,
-    /// Something to be considered a positive result.
-    pub value: T,
-}
-
-impl<T> AuthSuccess<T> {
-    /// Very similar to [`std::option::Option::map`].
-    /// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
-    /// a function to a contained value.
-    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
-        AuthSuccess {
-            reported_auth_ok: self.reported_auth_ok,
-            value: f(self.value),
-        }
-    }
-}
-
 /// This type serves two purposes:
 ///
 /// * When `T` is `()`, it's just a regular auth backend selector
@@ -61,9 +44,11 @@ pub enum BackendType<'a, T> {
     /// Current Cloud API (V2).
     Console(Cow<'a, console::provider::neon::Api>, T),
     /// Local mock of Cloud API (V2).
+    #[cfg(feature = "testing")]
     Postgres(Cow<'a, console::provider::mock::Api>, T),
     /// Authentication via a web browser.
     Link(Cow<'a, url::ApiUrl>),
+    #[cfg(test)]
     /// Test backend.
     Test(&'a dyn TestBackend),
 }
@@ -78,8 +63,10 @@ impl std::fmt::Display for BackendType<'_, ()> {
         use BackendType::*;
         match self {
             Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
+            #[cfg(feature = "testing")]
             Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
             Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            #[cfg(test)]
             Test(_) => fmt.debug_tuple("Test").finish(),
         }
     }
@@ -92,8 +79,10 @@ impl<T> BackendType<'_, T> {
         use BackendType::*;
         match self {
             Console(c, x) => Console(Cow::Borrowed(c), x),
+            #[cfg(feature = "testing")]
             Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
             Link(c) => Link(Cow::Borrowed(c)),
+            #[cfg(test)]
             Test(x) => Test(*x),
         }
     }
@@ -107,8 +96,10 @@ impl<'a, T> BackendType<'a, T> {
         use BackendType::*;
         match self {
             Console(c, x) => Console(c, f(x)),
+            #[cfg(feature = "testing")]
             Postgres(c, x) => Postgres(c, f(x)),
             Link(c) => Link(c),
+            #[cfg(test)]
             Test(x) => Test(x),
         }
     }
@@ -121,51 +112,87 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
         use BackendType::*;
         match self {
             Console(c, x) => x.map(|x| Console(c, x)),
+            #[cfg(feature = "testing")]
             Postgres(c, x) => x.map(|x| Postgres(c, x)),
             Link(c) => Ok(Link(c)),
+            #[cfg(test)]
             Test(x) => Ok(Test(x)),
         }
     }
 }
 
-pub enum ComputeCredentials {
+pub struct ComputeCredentials<T> {
+    pub info: ComputeUserInfo,
+    pub keys: T,
+}
+
+pub struct ComputeUserInfoNoEndpoint {
+    pub user: SmolStr,
+    pub peer_addr: IpAddr,
+    pub cache_key: SmolStr,
+}
+
+pub struct ComputeUserInfo {
+    pub endpoint: SmolStr,
+    pub inner: ComputeUserInfoNoEndpoint,
+}
+
+pub enum ComputeCredentialKeys {
+    #[cfg(feature = "testing")]
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
 }
 
+impl TryFrom<ClientCredentials> for ComputeUserInfo {
+    // user name
+    type Error = ComputeUserInfoNoEndpoint;
+
+    fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
+        let inner = ComputeUserInfoNoEndpoint {
+            user: creds.user,
+            peer_addr: creds.peer_addr,
+            cache_key: creds.cache_key,
+        };
+        match creds.project {
+            None => Err(inner),
+            Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }),
+        }
+    }
+}
+
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks_creds(
+///
+/// All authentication flows will emit an AuthenticationOk message if successful.
+async fn auth_quirks(
     api: &impl console::Api,
     extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
+    creds: ClientCredentials,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
     latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     // If there's no project so far, that entails that client doesn't
     // support SNI or other means of passing the endpoint (project) name.
     // We now expect to see a very specific payload in the place of password.
-    let maybe_success = if creds.project.is_none() {
-        // Password will be checked by the compute node later.
-        Some(hacks::password_hack(creds, client, latency_timer).await?)
-    } else {
-        None
+    let (info, unauthenticated_password) = match creds.try_into() {
+        Err(info) => {
+            let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
+            (res.info, Some(res.keys))
+        }
+        Ok(info) => (info, None),
     };
 
-    // Password hack should set the project name.
-    // TODO: make `creds.project` more type-safe.
-    assert!(creds.project.is_some());
     info!("fetching user's authentication info");
     // TODO(anna): this will slow down both "hacks" below; we probably need a cache.
     let AuthInfo {
         secret,
         allowed_ips,
-    } = api.get_auth_info(extra, creds).await?;
+    } = api.get_auth_info(extra, &info).await?;
 
     // check allowed list
-    if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) {
+    if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed());
     }
     let secret = secret.unwrap_or_else(|| {
@@ -173,36 +200,49 @@ async fn auth_quirks_creds(
         // prevent malicious probing (possible due to missing protocol steps).
         // This mocked secret will never lead to successful authentication.
         info!("authentication info not found, mocking it");
-        AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
+        AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
     });
 
-    if let Some(success) = maybe_success {
-        return Ok(success);
+    if let Some(password) = unauthenticated_password {
+        let auth_outcome = validate_password_and_exchange(&password, secret)?;
+        let keys = match auth_outcome {
+            crate::sasl::Outcome::Success(key) => key,
+            crate::sasl::Outcome::Failure(reason) => {
+                info!("auth backend failed with an error: {reason}");
+                return Err(auth::AuthError::auth_failed(&*info.inner.user));
+            }
+        };
+
+        // we have authenticated the password
+        client.write_message_noflush(&pq_proto::BeMessage::AuthenticationOk)?;
+
+        return Ok(ComputeCredentials { info, keys });
     }
 
+    // -- the remaining flows are self-authenticating --
+
     // Perform cleartext auth if we're allowed to do that.
     // Currently, we use it for websocket connections (latency).
     if allow_cleartext {
-        // Password will be checked by the compute node later.
-        return hacks::cleartext_hack(client, latency_timer).await;
+        return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
     }
 
     // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(creds, client, config, latency_timer, secret).await
+    classic::authenticate(info, client, config, latency_timer, secret).await
 }
 
-/// True to its name, this function encapsulates our current auth trade-offs.
-/// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks(
+/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
+/// only if authentication was successfuly.
+async fn auth_and_wake_compute(
     api: &impl console::Api,
     extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
+    creds: ClientCredentials,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
     latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
-    let auth_stuff = auth_quirks_creds(
+) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
+    let compute_credentials = auth_quirks(
         api,
         extra,
         creds,
@@ -215,7 +255,7 @@ async fn auth_quirks(
 
     let mut num_retries = 0;
     let mut node = loop {
-        let wake_res = api.wake_compute(extra, creds).await;
+        let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
         match handle_try_wake(wake_res, num_retries) {
             Err(e) => {
                 error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -232,27 +272,27 @@ async fn auth_quirks(
         tokio::time::sleep(wait_duration).await;
     };
 
-    match auth_stuff.value {
-        ComputeCredentials::Password(password) => node.config.password(password),
-        ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
+    match compute_credentials.keys {
+        #[cfg(feature = "testing")]
+        ComputeCredentialKeys::Password(password) => node.config.password(password),
+        ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
     };
 
-    Ok(AuthSuccess {
-        reported_auth_ok: auth_stuff.reported_auth_ok,
-        value: node,
-    })
+    Ok((node, compute_credentials.info))
 }
 
-impl BackendType<'_, ClientCredentials<'_>> {
+impl<'a> BackendType<'a, ClientCredentials> {
     /// Get compute endpoint name from the credentials.
-    pub fn get_endpoint(&self) -> Option<String> {
+    pub fn get_endpoint(&self) -> Option<SmolStr> {
         use BackendType::*;
 
         match self {
             Console(_, creds) => creds.project.clone(),
+            #[cfg(feature = "testing")]
             Postgres(_, creds) => creds.project.clone(),
-            Link(_) => Some("link".to_owned()),
-            Test(_) => Some("test".to_owned()),
+            Link(_) => Some("link".into()),
+            #[cfg(test)]
+            Test(_) => Some("test".into()),
         }
     }
 
@@ -261,9 +301,11 @@ impl BackendType<'_, ClientCredentials<'_>> {
         use BackendType::*;
 
         match self {
-            Console(_, creds) => creds.user,
-            Postgres(_, creds) => creds.user,
+            Console(_, creds) => &creds.user,
+            #[cfg(feature = "testing")]
+            Postgres(_, creds) => &creds.user,
             Link(_) => "link",
+            #[cfg(test)]
             Test(_) => "test",
         }
     }
@@ -271,26 +313,25 @@ impl BackendType<'_, ClientCredentials<'_>> {
     /// Authenticate the client via the requested backend, possibly using credentials.
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub async fn authenticate(
-        &mut self,
+        self,
         extra: &ConsoleReqExtra<'_>,
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
         latency_timer: &mut LatencyTimer,
-    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
         use BackendType::*;
 
         let res = match self {
             Console(api, creds) => {
                 info!(
-                    user = creds.user,
+                    user = &*creds.user,
                     project = creds.project(),
                     "performing authentication using the console"
                 );
 
-                let api = api.as_ref();
-                auth_quirks(
-                    api,
+                let (cache_info, user_info) = auth_and_wake_compute(
+                    &*api,
                     extra,
                     creds,
                     client,
@@ -298,18 +339,19 @@ impl BackendType<'_, ClientCredentials<'_>> {
                     config,
                     latency_timer,
                 )
-                .await?
+                .await?;
+                (cache_info, BackendType::Console(api, user_info))
             }
+            #[cfg(feature = "testing")]
             Postgres(api, creds) => {
                 info!(
-                    user = creds.user,
+                    user = &*creds.user,
                     project = creds.project(),
                     "performing authentication using a local postgres instance"
                 );
 
-                let api = api.as_ref();
-                auth_quirks(
-                    api,
+                let (cache_info, user_info) = auth_and_wake_compute(
+                    &*api,
                     extra,
                     creds,
                     client,
@@ -317,16 +359,21 @@ impl BackendType<'_, ClientCredentials<'_>> {
                     config,
                     latency_timer,
                 )
-                .await?
+                .await?;
+                (cache_info, BackendType::Postgres(api, user_info))
             }
             // NOTE: this auth backend doesn't use client credentials.
             Link(url) => {
                 info!("performing link authentication");
 
-                link::authenticate(url, client)
-                    .await?
-                    .map(CachedNodeInfo::new_uncached)
+                let node_info = link::authenticate(&url, client).await?;
+
+                (
+                    CachedNodeInfo::new_uncached(node_info),
+                    BackendType::Link(url),
+                )
             }
+            #[cfg(test)]
             Test(_) => {
                 unreachable!("this function should never be called in the test backend")
             }
@@ -335,7 +382,9 @@ impl BackendType<'_, ClientCredentials<'_>> {
         info!("user successfully authenticated");
         Ok(res)
     }
+}
 
+impl BackendType<'_, ComputeUserInfo> {
     pub async fn get_allowed_ips(
         &self,
         extra: &ConsoleReqExtra<'_>,
@@ -343,8 +392,10 @@ impl BackendType<'_, ClientCredentials<'_>> {
         use BackendType::*;
         match self {
             Console(api, creds) => api.get_allowed_ips(extra, creds).await,
+            #[cfg(feature = "testing")]
             Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
             Link(_) => Ok(Arc::new(vec![])),
+            #[cfg(test)]
             Test(x) => x.get_allowed_ips(),
         }
     }
@@ -359,8 +410,10 @@ impl BackendType<'_, ClientCredentials<'_>> {
 
         match self {
             Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
+            #[cfg(feature = "testing")]
             Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
             Link(_) => Ok(None),
+            #[cfg(test)]
             Test(x) => x.wake_compute().map(Some),
         }
     }
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index bb210821cd..ce52daf16c 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,6 +1,6 @@
-use super::{AuthSuccess, ComputeCredentials};
+use super::{ComputeCredentials, ComputeUserInfo};
 use crate::{
-    auth::{self, AuthFlow, ClientCredentials},
+    auth::{self, backend::ComputeCredentialKeys, AuthFlow},
     compute,
     config::AuthenticationConfig,
     console::AuthSecret,
@@ -12,14 +12,15 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};
 
 pub(super) async fn authenticate(
-    creds: &ClientCredentials<'_>,
+    creds: ComputeUserInfo,
     client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     config: &'static AuthenticationConfig,
     latency_timer: &mut LatencyTimer,
     secret: AuthSecret,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     let flow = AuthFlow::new(client);
     let scram_keys = match secret {
+        #[cfg(feature = "testing")]
         AuthSecret::Md5(_) => {
             info!("auth endpoint chooses MD5");
             return Err(auth::AuthError::bad_auth_method("MD5"));
@@ -53,7 +54,7 @@ pub(super) async fn authenticate(
                 sasl::Outcome::Success(key) => key,
                 sasl::Outcome::Failure(reason) => {
                     info!("auth backend failed with an error: {reason}");
-                    return Err(auth::AuthError::auth_failed(creds.user));
+                    return Err(auth::AuthError::auth_failed(&*creds.inner.user));
                 }
             };
 
@@ -64,9 +65,9 @@ pub(super) async fn authenticate(
         }
     };
 
-    Ok(AuthSuccess {
-        reported_auth_ok: false,
-        value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
+    Ok(ComputeCredentials {
+        info: creds,
+        keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
             scram_keys,
         )),
     })
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index 4448dbc56a..abbd25008b 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,7 +1,11 @@
-use super::{AuthSuccess, ComputeCredentials};
+use super::{
+    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
+};
 use crate::{
-    auth::{self, AuthFlow, ClientCredentials},
+    auth::{self, AuthFlow},
+    console::AuthSecret,
     proxy::LatencyTimer,
+    sasl,
     stream::{self, Stream},
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -11,35 +15,42 @@ use tracing::{info, warn};
 /// one round trip and *expensive* computations (>= 4096 HMAC iterations).
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
-pub async fn cleartext_hack(
+pub async fn authenticate_cleartext(
+    info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+    secret: AuthSecret,
+) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     warn!("cleartext auth flow override is enabled, proceeding");
 
     // pause the timer while we communicate with the client
     let _paused = latency_timer.pause();
 
-    let password = AuthFlow::new(client)
-        .begin(auth::CleartextPassword)
+    let auth_outcome = AuthFlow::new(client)
+        .begin(auth::CleartextPassword(secret))
         .await?
         .authenticate()
         .await?;
 
-    // Report tentative success; compute node will check the password anyway.
-    Ok(AuthSuccess {
-        reported_auth_ok: false,
-        value: ComputeCredentials::Password(password),
-    })
+    let keys = match auth_outcome {
+        sasl::Outcome::Success(key) => key,
+        sasl::Outcome::Failure(reason) => {
+            info!("auth backend failed with an error: {reason}");
+            return Err(auth::AuthError::auth_failed(&*info.inner.user));
+        }
+    };
+
+    Ok(ComputeCredentials { info, keys })
 }
 
 /// Workaround for clients which don't provide an endpoint (project) name.
-/// Very similar to [`cleartext_hack`], but there's a specific password format.
-pub async fn password_hack(
-    creds: &mut ClientCredentials<'_>,
+/// Similar to [`authenticate_cleartext`], but there's a specific password format,
+/// and passwords are not yet validated (we don't know how to validate them!)
+pub async fn password_hack_no_authentication(
+    info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<ComputeCredentials<Vec<u8>>> {
     warn!("project not specified, resorting to the password hack auth flow");
 
     // pause the timer while we communicate with the client
@@ -48,15 +59,17 @@ pub async fn password_hack(
     let payload = AuthFlow::new(client)
         .begin(auth::PasswordHack)
         .await?
-        .authenticate()
+        .get_password()
         .await?;
 
-    info!(project = &payload.endpoint, "received missing parameter");
-    creds.project = Some(payload.endpoint);
+    info!(project = &*payload.endpoint, "received missing parameter");
 
     // Report tentative success; compute node will check the password anyway.
-    Ok(AuthSuccess {
-        reported_auth_ok: false,
-        value: ComputeCredentials::Password(payload.password),
+    Ok(ComputeCredentials {
+        info: ComputeUserInfo {
+            inner: info,
+            endpoint: payload.endpoint,
+        },
+        keys: payload.password,
     })
 }
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index 3a77d7e5ca..2cf7e3acc7 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,4 +1,3 @@
-use super::AuthSuccess;
 use crate::{
     auth, compute,
     console::{self, provider::NodeInfo},
@@ -57,7 +56,7 @@ pub fn new_psql_session_id() -> String {
 pub(super) async fn authenticate(
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<NodeInfo>> {
+) -> auth::Result<NodeInfo> {
     let psql_session_id = new_psql_session_id();
     let span = info_span!("link", psql_session_id = &psql_session_id);
     let greeting = hello_message(link_uri, &psql_session_id);
@@ -102,12 +101,9 @@ pub(super) async fn authenticate(
         config.password(password.as_ref());
     }
 
-    Ok(AuthSuccess {
-        reported_auth_ok: true,
-        value: NodeInfo {
-            config,
-            aux: db_info.aux,
-            allow_self_signed_compute: false, // caller may override
-        },
+    Ok(NodeInfo {
+        config,
+        aux: db_info.aux,
+        allow_self_signed_compute: false, // caller may override
     })
 }
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index facb8da8cd..dd7c58255f 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -7,10 +7,8 @@ use crate::{
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use std::{
-    collections::HashSet,
-    net::{IpAddr, SocketAddr},
-};
+use smol_str::SmolStr;
+use std::{collections::HashSet, net::IpAddr};
 use thiserror::Error;
 use tracing::{info, warn};
 
@@ -24,7 +22,7 @@ pub enum ClientCredsParseError {
          SNI ('{}') and project option ('{}').",
         .domain, .option,
     )]
-    InconsistentProjectNames { domain: String, option: String },
+    InconsistentProjectNames { domain: SmolStr, option: SmolStr },
 
     #[error(
         "Common name inferred from SNI ('{}') is not known",
@@ -33,7 +31,7 @@ pub enum ClientCredsParseError {
     UnknownCommonName { cn: String },
 
     #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
-    MalformedProjectName(String),
+    MalformedProjectName(SmolStr),
 }
 
 impl UserFacingError for ClientCredsParseError {}
@@ -41,34 +39,34 @@ impl UserFacingError for ClientCredsParseError {}
 /// Various client credentials which we use for authentication.
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ClientCredentials<'a> {
-    pub user: &'a str,
+pub struct ClientCredentials {
+    pub user: SmolStr,
     // TODO: this is a severe misnomer! We should think of a new name ASAP.
-    pub project: Option<String>,
+    pub project: Option<SmolStr>,
 
-    pub cache_key: String,
-    pub peer_addr: SocketAddr,
+    pub cache_key: SmolStr,
+    pub peer_addr: IpAddr,
 }
 
-impl ClientCredentials<'_> {
+impl ClientCredentials {
     #[inline]
     pub fn project(&self) -> Option<&str> {
         self.project.as_deref()
     }
 }
 
-impl<'a> ClientCredentials<'a> {
+impl ClientCredentials {
     pub fn parse(
-        params: &'a StartupMessageParams,
+        params: &StartupMessageParams,
         sni: Option<&str>,
         common_names: Option<HashSet<String>>,
-        peer_addr: SocketAddr,
+        peer_addr: IpAddr,
     ) -> Result<Self, ClientCredsParseError> {
         use ClientCredsParseError::*;
 
         // Some parameters are stored in the startup message.
         let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user = get_param("user")?;
+        let user = get_param("user")?.into();
 
         // Project name might be passed via PG's command-line options.
         let project_option = params
@@ -82,7 +80,7 @@ impl<'a> ClientCredentials<'a> {
                     .at_most_one()
                     .ok()?
             })
-            .map(|name| name.to_string());
+            .map(|name| name.into());
 
         let project_from_domain = if let Some(sni_str) = sni {
             if let Some(cn) = common_names {
@@ -121,7 +119,7 @@ impl<'a> ClientCredentials<'a> {
         }
         .transpose()?;
 
-        info!(user, project = project.as_deref(), "credentials");
+        info!(%user, project = project.as_deref(), "credentials");
         if sni.is_some() {
             info!("Connection with sni");
             NUM_CONNECTION_ACCEPTED_BY_SNI
@@ -143,7 +141,8 @@ impl<'a> ClientCredentials<'a> {
             "{}{}",
             project.as_deref().unwrap_or(""),
             neon_options(params).unwrap_or("".to_string())
-        );
+        )
+        .into();
 
         Ok(Self {
             user,
@@ -206,10 +205,10 @@ fn project_name_valid(name: &str) -> bool {
     name.chars().all(|c| c.is_alphanumeric() || c == '-')
 }
 
-fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
+fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<SmolStr> {
     sni.strip_suffix(common_name)?
         .strip_suffix('.')
-        .map(str::to_owned)
+        .map(SmolStr::from)
 }
 
 #[cfg(test)]
@@ -221,7 +220,7 @@ mod tests {
     fn parse_bare_minimum() -> anyhow::Result<()> {
         // According to postgresql, only `user` should be required.
         let options = StartupMessageParams::new([("user", "john_doe")]);
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project, None);
@@ -236,7 +235,7 @@ mod tests {
             ("database", "world"), // should be ignored
             ("foo", "bar"),        // should be ignored
         ]);
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project, None);
@@ -251,7 +250,7 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -267,7 +266,7 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -282,7 +281,7 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar -c geqo=off"),
         ]);
 
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -300,7 +299,7 @@ mod tests {
             ),
         ]);
 
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
         assert_eq!(creds.user, "john_doe");
         assert!(creds.project.is_none());
@@ -315,7 +314,7 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
         ]);
 
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
         assert_eq!(creds.user, "john_doe");
         assert!(creds.project.is_none());
@@ -330,7 +329,7 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -344,13 +343,13 @@ mod tests {
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.a.com");
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
         assert_eq!(creds.project.as_deref(), Some("p1"));
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.b.com");
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
         assert_eq!(creds.project.as_deref(), Some("p1"));
 
@@ -365,7 +364,7 @@ mod tests {
         let sni = Some("second.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
             .expect_err("should fail");
         match err {
@@ -384,7 +383,7 @@ mod tests {
         let sni = Some("project.localhost");
         let common_names = Some(["example.com".into()].into());
 
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
             .expect_err("should fail");
         match err {
@@ -404,7 +403,7 @@ mod tests {
 
         let sni = Some("project.localhost");
         let common_names = Some(["localhost".into()].into());
-        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
         assert_eq!(creds.project.as_deref(), Some("project"));
         assert_eq!(
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index efb90733d6..3151a77263 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -1,8 +1,9 @@
 //! Main authentication flow.
 
-use super::{AuthErrorImpl, PasswordHackPayload};
+use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
 use crate::{
     config::TlsServerEndPoint,
+    console::AuthSecret,
     sasl, scram,
     stream::{PqStream, Stream},
 };
@@ -50,7 +51,7 @@ impl AuthMethod for PasswordHack {
 
 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
-pub struct CleartextPassword;
+pub struct CleartextPassword(pub AuthSecret);
 
 impl AuthMethod for CleartextPassword {
     #[inline(always)]
@@ -98,7 +99,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
 
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
     /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> super::Result<PasswordHackPayload> {
+    pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
         let msg = self.stream.read_password_message().await?;
         let password = msg
             .strip_suffix(&[0])
@@ -117,13 +118,19 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
 
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
     /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> super::Result<Vec<u8>> {
+    pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
         let msg = self.stream.read_password_message().await?;
         let password = msg
             .strip_suffix(&[0])
             .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
 
-        Ok(password.to_vec())
+        let outcome = validate_password_and_exchange(password, self.state.0)?;
+
+        if let sasl::Outcome::Success(_) = &outcome {
+            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
+        }
+
+        Ok(outcome)
     }
 }
 
@@ -152,6 +159,49 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
             ))
             .await?;
 
+        if let sasl::Outcome::Success(_) = &outcome {
+            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
+        }
+
         Ok(outcome)
     }
 }
+
+pub(super) fn validate_password_and_exchange(
+    password: &[u8],
+    secret: AuthSecret,
+) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
+    match secret {
+        #[cfg(feature = "testing")]
+        AuthSecret::Md5(_) => {
+            // test only
+            Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password(
+                password.to_owned(),
+            )))
+        }
+        // perform scram authentication as both client and server to validate the keys
+        AuthSecret::Scram(scram_secret) => {
+            use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
+            let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
+            let outcome = crate::scram::exchange(
+                &scram_secret,
+                sasl_client,
+                crate::config::TlsServerEndPoint::Undefined,
+            )?;
+
+            let client_key = match outcome {
+                sasl::Outcome::Success(client_key) => client_key,
+                sasl::Outcome::Failure(reason) => return Ok(sasl::Outcome::Failure(reason)),
+            };
+
+            let keys = crate::compute::ScramKeys {
+                client_key: client_key.as_bytes(),
+                server_key: scram_secret.server_key.as_bytes(),
+            };
+
+            Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys(
+                tokio_postgres::config::AuthKeys::ScramSha256(keys),
+            )))
+        }
+    }
+}
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index d1da208fef..372b0764ee 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -4,9 +4,10 @@
 //! UPDATE (Mon Aug  8 13:20:34 UTC 2022): the payload format has been simplified.
 
 use bstr::ByteSlice;
+use smol_str::SmolStr;
 
 pub struct PasswordHackPayload {
-    pub endpoint: String,
+    pub endpoint: SmolStr,
     pub password: Vec<u8>,
 }
 
@@ -18,7 +19,7 @@ impl PasswordHackPayload {
             if let Some((endpoint, password)) = bytes.split_once_str(sep) {
                 let endpoint = endpoint.to_str().ok()?;
                 return Some(Self {
-                    endpoint: parse_endpoint_param(endpoint)?.to_owned(),
+                    endpoint: parse_endpoint_param(endpoint)?.into(),
                     password: password.to_owned(),
                 });
             }
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 6c4189de75..fc1c44809a 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -30,6 +30,7 @@ use clap::{Parser, ValueEnum};
 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackend {
     Console,
+    #[cfg(feature = "testing")]
     Postgres,
     Link,
 }
@@ -289,6 +290,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             let api = console::provider::neon::Api::new(endpoint, caches, locks);
             auth::BackendType::Console(Cow::Owned(api), ())
         }
+        #[cfg(feature = "testing")]
         AuthBackend::Postgres => {
             let url = args.auth_endpoint.parse()?;
             let api = console::provider::mock::Api::new(url);
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index e735b9f66c..ccb5cbdb92 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -1,9 +1,10 @@
+#[cfg(feature = "testing")]
 pub mod mock;
 pub mod neon;
 
 use super::messages::MetricsAuxInfo;
 use crate::{
-    auth::ClientCredentials,
+    auth::backend::ComputeUserInfo,
     cache::{timed_lru, TimedLru},
     compute, scram,
 };
@@ -205,6 +206,7 @@ pub struct ConsoleReqExtra<'a> {
 
 /// Auth secret which is managed by the cloud.
 pub enum AuthSecret {
+    #[cfg(feature = "testing")]
     /// Md5 hash of user's password.
     Md5([u8; 16]),
 
@@ -247,20 +249,20 @@ pub trait Api {
     async fn get_auth_info(
         &self,
         extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips(
         &self,
         extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;
 
     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
         &self,
         extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }
 
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 4cc68f0ac1..8aad8c06bc 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -6,7 +6,7 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
-use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUrl};
+use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use thiserror::Error;
@@ -47,7 +47,7 @@ impl Api {
 
     async fn do_get_auth_info(
         &self,
-        creds: &ClientCredentials<'_>,
+        creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         let (secret, allowed_ips) = async {
             // Perhaps we could persist this connection, but then we'd have to
@@ -60,7 +60,7 @@ impl Api {
             let secret = match get_execute_postgres_query(
                 &client,
                 "select rolpassword from pg_catalog.pg_authid where rolname = $1",
-                &[&creds.user],
+                &[&&*creds.inner.user],
                 "rolpassword",
             )
             .await?
@@ -71,14 +71,14 @@ impl Api {
                     secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
                 }
                 None => {
-                    warn!("user '{}' does not exist", creds.user);
+                    warn!("user '{}' does not exist", creds.inner.user);
                     None
                 }
             };
             let allowed_ips = match get_execute_postgres_query(
                 &client,
                 "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
-                &[&creds.project.clone().unwrap_or_default().as_str()],
+                &[&creds.endpoint.as_str()],
                 "allowed_ips",
             )
             .await?
@@ -145,7 +145,7 @@ impl super::Api for Api {
     async fn get_auth_info(
         &self,
         _extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         self.do_get_auth_info(creds).await
     }
@@ -153,7 +153,7 @@ impl super::Api for Api {
     async fn get_allowed_ips(
         &self,
         _extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
         Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
     }
@@ -162,7 +162,7 @@ impl super::Api for Api {
     async fn wake_compute(
         &self,
         _extra: &ConsoleReqExtra<'_>,
-        _creds: &ClientCredentials,
+        _creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         self.do_wake_compute()
             .map_ok(CachedNodeInfo::new_uncached)
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 7828a7d7e4..f0510e91ea 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -5,12 +5,8 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
-use crate::{
-    auth::ClientCredentials,
-    compute, http,
-    proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
-    scram,
-};
+use crate::proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
+use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
@@ -53,7 +49,7 @@ impl Api {
     async fn do_get_auth_info(
         &self,
         extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         let request_id = uuid::Uuid::new_v4().to_string();
         async {
@@ -65,8 +61,8 @@ impl Api {
                 .query(&[("session_id", extra.session_id)])
                 .query(&[
                     ("application_name", extra.application_name),
-                    ("project", Some(creds.project().expect("impossible"))),
-                    ("role", Some(creds.user)),
+                    ("project", Some(&creds.endpoint)),
+                    ("role", Some(&creds.inner.user)),
                 ])
                 .build()?;
 
@@ -106,9 +102,8 @@ impl Api {
     async fn do_wake_compute(
         &self,
         extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        creds: &ComputeUserInfo,
     ) -> Result<NodeInfo, WakeComputeError> {
-        let project = creds.project().expect("impossible");
         let request_id = uuid::Uuid::new_v4().to_string();
         async {
             let request = self
@@ -119,7 +114,7 @@ impl Api {
                 .query(&[("session_id", extra.session_id)])
                 .query(&[
                     ("application_name", extra.application_name),
-                    ("project", Some(project)),
+                    ("project", Some(&creds.endpoint)),
                     ("options", extra.options),
                 ])
                 .build()?;
@@ -162,7 +157,7 @@ impl super::Api for Api {
     async fn get_auth_info(
         &self,
         extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         self.do_get_auth_info(extra, creds).await
     }
@@ -170,9 +165,9 @@ impl super::Api for Api {
     async fn get_allowed_ips(
         &self,
         extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
-        let key: &str = creds.project().expect("impossible");
+        let key: &str = &creds.endpoint;
         if let Some(allowed_ips) = self.caches.allowed_ips.get(key) {
             ALLOWED_IPS_BY_CACHE_OUTCOME
                 .with_label_values(&["hit"])
@@ -193,9 +188,9 @@ impl super::Api for Api {
     async fn wake_compute(
         &self,
         extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
-        let key: &str = &creds.cache_key;
+        let key: &str = &creds.inner.cache_key;
 
         // Every time we do a wakeup http request, the compute node will stay up
         // for some time (highly depends on the console's scale-to-zero policy);
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 3b6d9cb61d..7cf3ed5b8a 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -2,7 +2,7 @@
 mod tests;
 
 use crate::{
-    auth::{self, backend::AuthSuccess},
+    auth,
     cancellation::{self, CancelMap},
     compute::{self, PostgresConnection},
     config::{AuthenticationConfig, ProxyConfig, TlsConfig},
@@ -24,7 +24,7 @@ use prometheus::{
     IntGaugeVec,
 };
 use regex::Regex;
-use std::{error::Error, io, net::SocketAddr, ops::ControlFlow, sync::Arc, time::Instant};
+use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
 use tokio::{
     io::{AsyncRead, AsyncWrite, AsyncWriteExt},
     time,
@@ -318,7 +318,7 @@ pub async fn task_main(
                             .set_nodelay(true)
                             .context("failed to set socket option")?;
 
-                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr).await
+                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr.ip()).await
                     }
                     .instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty))
                     .unwrap_or_else(move |e| {
@@ -408,7 +408,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     session_id: uuid::Uuid,
     stream: S,
     mode: ClientMode,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> anyhow::Result<()> {
     info!(
         protocol = mode.protocol_label(),
@@ -666,7 +666,7 @@ pub async fn connect_to_compute<M: ConnectMechanism>(
     mechanism: &M,
     mut node_info: console::CachedNodeInfo,
     extra: &console::ConsoleReqExtra<'_>,
-    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
+    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
     mut latency_timer: LatencyTimer,
 ) -> Result<M::Connection, M::Error>
 where
@@ -696,10 +696,12 @@ where
     let node_info = loop {
         let wake_res = match creds {
             auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
+            #[cfg(feature = "testing")]
             auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
             // nothing to do?
             auth::BackendType::Link(_) => return Err(err.into()),
             // test backend
+            #[cfg(test)]
             auth::BackendType::Test(x) => x.wake_compute(),
         };
 
@@ -838,7 +840,6 @@ pub fn retry_after(num_retries: u32) -> time::Duration {
 #[tracing::instrument(skip_all)]
 async fn prepare_client_connection(
     node: &compute::PostgresConnection,
-    reported_auth_ok: bool,
     session: cancellation::Session<'_>,
     stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> anyhow::Result<()> {
@@ -846,13 +847,6 @@ async fn prepare_client_connection(
     // The new token (cancel_key_data) will be sent to the client.
     let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone());
 
-    // Report authentication success if we haven't done this already.
-    // Note that we do this only (for the most part) after we've connected
-    // to a compute (see above) which performs its own authentication.
-    if !reported_auth_ok {
-        stream.write_message_noflush(&Be::AuthenticationOk)?;
-    }
-
     // Forward all postgres connection params to the client.
     // Right now the implementation is very hacky and inefficent (ideally,
     // we don't need an intermediate hashmap), but at least it should be correct.
@@ -921,7 +915,7 @@ struct Client<'a, S> {
     /// The underlying libpq protocol stream.
     stream: PqStream<Stream<S>>,
     /// Client credentials that we care about.
-    creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
+    creds: auth::BackendType<'a, auth::ClientCredentials>,
     /// KV-dictionary with PostgreSQL connection params.
     params: &'a StartupMessageParams,
     /// Unique connection ID.
@@ -934,7 +928,7 @@ impl<'a, S> Client<'a, S> {
     /// Construct a new connection context.
     fn new(
         stream: PqStream<Stream<S>>,
-        creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
+        creds: auth::BackendType<'a, auth::ClientCredentials>,
         params: &'a StartupMessageParams,
         session_id: uuid::Uuid,
         allow_self_signed_compute: bool,
@@ -953,7 +947,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
     /// Let the client authenticate and connect to the designated compute node.
     // Instrumentation logs endpoint name everywhere. Doesn't work for link
     // auth; strictly speaking we don't know endpoint name in its case.
-    #[tracing::instrument(name = "", fields(ep = self.creds.get_endpoint().unwrap_or("".to_owned())), skip_all)]
+    #[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)]
     async fn connect_to_db(
         self,
         session: cancellation::Session<'_>,
@@ -962,7 +956,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
     ) -> anyhow::Result<()> {
         let Self {
             mut stream,
-            mut creds,
+            creds,
             params,
             session_id,
             allow_self_signed_compute,
@@ -978,6 +972,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
 
         let mut latency_timer = LatencyTimer::new(mode.protocol_label());
 
+        let user = creds.get_user().to_owned();
         let auth_result = match creds
             .authenticate(
                 &extra,
@@ -990,7 +985,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
         {
             Ok(auth_result) => auth_result,
             Err(e) => {
-                let user = creds.get_user();
                 let db = params.get("database");
                 let app = params.get("application_name");
                 let params_span = tracing::info_span!("", ?user, ?db, ?app);
@@ -999,10 +993,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             }
         };
 
-        let AuthSuccess {
-            reported_auth_ok,
-            value: mut node_info,
-        } = auth_result;
+        let (mut node_info, creds) = auth_result;
 
         node_info.allow_self_signed_compute = allow_self_signed_compute;
 
@@ -1025,7 +1016,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
         }
 
-        prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?;
+        prepare_client_connection(&node, session, &mut stream).await?;
         // Before proxy passing, forward to compute whatever data is left in the
         // PqStream input buffer. Normally there is none, but our serverless npm
         // driver in pipeline mode sends startup, password and first query
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index b97c0efce4..222661db4a 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -3,8 +3,7 @@
 mod mitm;
 
 use super::*;
-use crate::auth::backend::TestBackend;
-use crate::auth::ClientCredentials;
+use crate::auth::backend::{ComputeUserInfo, TestBackend};
 use crate::config::CertResolver;
 use crate::console::{CachedNodeInfo, NodeInfo};
 use crate::{auth, http, sasl, scram};
@@ -109,8 +108,9 @@ fn generate_tls_config<'a>(
 trait TestAuth: Sized {
     async fn authenticate<S: AsyncRead + AsyncWrite + Unpin + Send>(
         self,
-        _stream: &mut PqStream<Stream<S>>,
+        stream: &mut PqStream<Stream<S>>,
     ) -> anyhow::Result<()> {
+        stream.write_message_noflush(&Be::AuthenticationOk)?;
         Ok(())
     }
 }
@@ -168,7 +168,6 @@ async fn dummy_proxy(
     auth.authenticate(&mut stream).await?;
 
     stream
-        .write_message_noflush(&Be::AuthenticationOk)?
         .write_message_noflush(&Be::CLIENT_ENCODING)?
         .write_message(&Be::ReadyForQuery)
         .await?;
@@ -486,7 +485,7 @@ fn helper_create_connect_info(
 ) -> (
     CachedNodeInfo,
     console::ConsoleReqExtra<'static>,
-    auth::BackendType<'_, ClientCredentials<'static>>,
+    auth::BackendType<'_, ComputeUserInfo>,
 ) {
     let cache = helper_create_cached_node_info();
     let extra = console::ConsoleReqExtra {
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index 63271309e1..49a7a13043 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -15,7 +15,7 @@ mod signature;
 #[cfg(any(test, doc))]
 mod password;
 
-pub use exchange::Exchange;
+pub use exchange::{exchange, Exchange};
 pub use key::ScramKey;
 pub use secret::ServerSecret;
 
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index facaba3798..9af7db5201 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -1,5 +1,9 @@
 //! Implementation of the SCRAM authentication algorithm.
 
+use std::convert::Infallible;
+
+use postgres_protocol::authentication::sasl::ScramSha256;
+
 use super::messages::{
     ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
@@ -29,22 +33,27 @@ impl std::str::FromStr for TlsServerEndPoint {
     }
 }
 
+struct SaslSentInner {
+    cbind_flag: ChannelBinding<TlsServerEndPoint>,
+    client_first_message_bare: String,
+    server_first_message: OwnedServerFirstMessage,
+}
+
+struct SaslInitial {
+    nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
+}
+
 enum ExchangeState {
     /// Waiting for [`ClientFirstMessage`].
-    Initial,
+    Initial(SaslInitial),
     /// Waiting for [`ClientFinalMessage`].
-    SaltSent {
-        cbind_flag: ChannelBinding<TlsServerEndPoint>,
-        client_first_message_bare: String,
-        server_first_message: OwnedServerFirstMessage,
-    },
+    SaltSent(SaslSentInner),
 }
 
 /// Server's side of SCRAM auth algorithm.
 pub struct Exchange<'a> {
     state: ExchangeState,
     secret: &'a ServerSecret,
-    nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
     tls_server_end_point: config::TlsServerEndPoint,
 }
 
@@ -55,90 +64,160 @@ impl<'a> Exchange<'a> {
         tls_server_end_point: config::TlsServerEndPoint,
     ) -> Self {
         Self {
-            state: ExchangeState::Initial,
+            state: ExchangeState::Initial(SaslInitial { nonce }),
             secret,
-            nonce,
             tls_server_end_point,
         }
     }
 }
 
+pub fn exchange(
+    secret: &ServerSecret,
+    mut client: ScramSha256,
+    tls_server_end_point: config::TlsServerEndPoint,
+) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
+    use sasl::Step::*;
+
+    let init = SaslInitial {
+        nonce: rand::random,
+    };
+
+    let client_first = std::str::from_utf8(client.message())
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+    let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
+        Continue(sent, server_first) => {
+            client.update(server_first.as_bytes())?;
+            sent
+        }
+        Success(x, _) => match x {},
+        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
+    };
+
+    let client_final = std::str::from_utf8(client.message())
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+    let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
+        Success(keys, server_final) => {
+            client.finish(server_final.as_bytes())?;
+            keys
+        }
+        Continue(x, _) => match x {},
+        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
+    };
+
+    Ok(sasl::Outcome::Success(keys))
+}
+
+impl SaslInitial {
+    fn transition(
+        &self,
+        secret: &ServerSecret,
+        tls_server_end_point: &config::TlsServerEndPoint,
+        input: &str,
+    ) -> sasl::Result<sasl::Step<SaslSentInner, Infallible>> {
+        let client_first_message = ClientFirstMessage::parse(input)
+            .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;
+
+        // If the flag is set to "y" and the server supports channel
+        // binding, the server MUST fail authentication
+        if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
+            && tls_server_end_point.supported()
+        {
+            return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
+        }
+
+        let server_first_message = client_first_message.build_server_first_message(
+            &(self.nonce)(),
+            &secret.salt_base64,
+            secret.iterations,
+        );
+        let msg = server_first_message.as_str().to_owned();
+
+        let next = SaslSentInner {
+            cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
+            client_first_message_bare: client_first_message.bare.to_owned(),
+            server_first_message,
+        };
+
+        Ok(sasl::Step::Continue(next, msg))
+    }
+}
+
+impl SaslSentInner {
+    fn transition(
+        &self,
+        secret: &ServerSecret,
+        tls_server_end_point: &config::TlsServerEndPoint,
+        input: &str,
+    ) -> sasl::Result<sasl::Step<Infallible, super::ScramKey>> {
+        let Self {
+            cbind_flag,
+            client_first_message_bare,
+            server_first_message,
+        } = self;
+
+        let client_final_message = ClientFinalMessage::parse(input)
+            .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;
+
+        let channel_binding = cbind_flag.encode(|_| match tls_server_end_point {
+            config::TlsServerEndPoint::Sha256(x) => Ok(x),
+            config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding),
+        })?;
+
+        // This might've been caused by a MITM attack
+        if client_final_message.channel_binding != channel_binding {
+            return Err(SaslError::ChannelBindingFailed(
+                "insecure connection: secure channel data mismatch",
+            ));
+        }
+
+        if client_final_message.nonce != server_first_message.nonce() {
+            return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
+        }
+
+        let signature_builder = SignatureBuilder {
+            client_first_message_bare,
+            server_first_message: server_first_message.as_str(),
+            client_final_message_without_proof: client_final_message.without_proof,
+        };
+
+        let client_key = signature_builder
+            .build(&secret.stored_key)
+            .derive_client_key(&client_final_message.proof);
+
+        // Auth fails either if keys don't match or it's pre-determined to fail.
+        if client_key.sha256() != secret.stored_key || secret.doomed {
+            return Ok(sasl::Step::Failure("password doesn't match"));
+        }
+
+        let msg =
+            client_final_message.build_server_final_message(signature_builder, &secret.server_key);
+
+        Ok(sasl::Step::Success(client_key, msg))
+    }
+}
+
 impl sasl::Mechanism for Exchange<'_> {
     type Output = super::ScramKey;
 
     fn exchange(mut self, input: &str) -> sasl::Result<sasl::Step<Self, Self::Output>> {
         use {sasl::Step::*, ExchangeState::*};
         match &self.state {
-            Initial => {
-                let client_first_message = ClientFirstMessage::parse(input)
-                    .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;
-
-                // If the flag is set to "y" and the server supports channel
-                // binding, the server MUST fail authentication
-                if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
-                    && self.tls_server_end_point.supported()
-                {
-                    return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
+            Initial(init) => {
+                match init.transition(self.secret, &self.tls_server_end_point, input)? {
+                    Continue(sent, msg) => {
+                        self.state = SaltSent(sent);
+                        Ok(Continue(self, msg))
+                    }
+                    Success(x, _) => match x {},
+                    Failure(msg) => Ok(Failure(msg)),
                 }
-
-                let server_first_message = client_first_message.build_server_first_message(
-                    &(self.nonce)(),
-                    &self.secret.salt_base64,
-                    self.secret.iterations,
-                );
-                let msg = server_first_message.as_str().to_owned();
-
-                self.state = SaltSent {
-                    cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
-                    client_first_message_bare: client_first_message.bare.to_owned(),
-                    server_first_message,
-                };
-
-                Ok(Continue(self, msg))
             }
-            SaltSent {
-                cbind_flag,
-                client_first_message_bare,
-                server_first_message,
-            } => {
-                let client_final_message = ClientFinalMessage::parse(input)
-                    .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;
-
-                let channel_binding = cbind_flag.encode(|_| match &self.tls_server_end_point {
-                    config::TlsServerEndPoint::Sha256(x) => Ok(x),
-                    config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding),
-                })?;
-
-                // This might've been caused by a MITM attack
-                if client_final_message.channel_binding != channel_binding {
-                    return Err(SaslError::ChannelBindingFailed(
-                        "insecure connection: secure channel data mismatch",
-                    ));
+            SaltSent(sent) => {
+                match sent.transition(self.secret, &self.tls_server_end_point, input)? {
+                    Success(keys, msg) => Ok(Success(keys, msg)),
+                    Continue(x, _) => match x {},
+                    Failure(msg) => Ok(Failure(msg)),
                 }
-
-                if client_final_message.nonce != server_first_message.nonce() {
-                    return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
-                }
-
-                let signature_builder = SignatureBuilder {
-                    client_first_message_bare,
-                    server_first_message: server_first_message.as_str(),
-                    client_final_message_without_proof: client_final_message.without_proof,
-                };
-
-                let client_key = signature_builder
-                    .build(&self.secret.stored_key)
-                    .derive_client_key(&client_final_message.proof);
-
-                // Auth fails either if keys don't match or it's pre-determined to fail.
-                if client_key.sha256() != self.secret.stored_key || self.secret.doomed {
-                    return Ok(Failure("password doesn't match"));
-                }
-
-                let msg = client_final_message
-                    .build_server_final_message(signature_builder, &self.secret.server_key);
-
-                Ok(Success(client_key, msg))
             }
         }
     }
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 45f8132393..5a992d6461 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -23,7 +23,7 @@ use hyper::{
     Body, Method, Request, Response,
 };
 
-use std::net::SocketAddr;
+use std::net::IpAddr;
 use std::task::Poll;
 use std::{future::ready, sync::Arc};
 use tls_listener::TlsListener;
@@ -103,7 +103,13 @@ pub async fn task_main(
                             let session_id = uuid::Uuid::new_v4();
 
                             request_handler(
-                                req, config, conn_pool, cancel_map, session_id, sni_name, peer_addr,
+                                req,
+                                config,
+                                conn_pool,
+                                cancel_map,
+                                session_id,
+                                sni_name,
+                                peer_addr.ip(),
                             )
                             .instrument(info_span!(
                                 "serverless",
@@ -171,7 +177,7 @@ async fn request_handler(
     cancel_map: Arc<CancelMap>,
     session_id: uuid::Uuid,
     sni_hostname: Option<String>,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> Result<Response<Body>, ApiError> {
     let host = request
         .headers()
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index ca7a9ad0a0..b9d1a9692d 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -1,4 +1,4 @@
-use anyhow::Context;
+use anyhow::{anyhow, Context};
 use async_trait::async_trait;
 use dashmap::DashMap;
 use futures::future::poll_fn;
@@ -9,7 +9,7 @@ use pbkdf2::{
 };
 use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
-use std::{collections::HashMap, net::SocketAddr, sync::Arc};
+use std::{collections::HashMap, net::IpAddr, sync::Arc};
 use std::{
     fmt,
     task::{ready, Poll},
@@ -22,7 +22,7 @@ use tokio::time;
 use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
 
 use crate::{
-    auth::{self, check_peer_addr_is_in_list},
+    auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
     console,
     proxy::{
         neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
@@ -146,7 +146,7 @@ impl GlobalConnPool {
         conn_info: &ConnInfo,
         force_new: bool,
         session_id: uuid::Uuid,
-        peer_addr: SocketAddr,
+        peer_addr: IpAddr,
     ) -> anyhow::Result<Client> {
         let mut client: Option<ClientInner> = None;
         let mut latency_timer = LatencyTimer::new("http");
@@ -406,7 +406,7 @@ async fn connect_to_compute(
     conn_id: uuid::Uuid,
     session_id: uuid::Uuid,
     latency_timer: LatencyTimer,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> anyhow::Result<ClientInner> {
     let tls = config.tls_config.as_ref();
     let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -423,6 +423,9 @@ async fn connect_to_compute(
         common_names,
         peer_addr,
     )?;
+
+    let creds =
+        ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?;
     let backend = config.auth_backend.as_ref().map(|_| creds);
 
     let console_options = neon_options(&params);
@@ -435,7 +438,7 @@ async fn connect_to_compute(
     // TODO(anna): this is a bit hacky way, consider using console notification listener.
     if !config.disable_ip_check_for_http {
         let allowed_ips = backend.get_allowed_ips(&extra).await?;
-        if !check_peer_addr_is_in_list(&peer_addr.ip(), &allowed_ips) {
+        if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) {
             return Err(auth::AuthError::ip_address_not_allowed().into());
         }
     }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 6c337a837c..6e80260193 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,4 +1,4 @@
-use std::net::SocketAddr;
+use std::net::IpAddr;
 use std::sync::Arc;
 
 use anyhow::bail;
@@ -202,7 +202,7 @@ pub async fn handle(
     sni_hostname: Option<String>,
     conn_pool: Arc<GlobalConnPool>,
     session_id: uuid::Uuid,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
     config: &'static HttpConfig,
 ) -> Result<Response<Body>, ApiError> {
     let result = tokio::time::timeout(
@@ -301,7 +301,7 @@ async fn handle_inner(
     sni_hostname: Option<String>,
     conn_pool: Arc<GlobalConnPool>,
     session_id: uuid::Uuid,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> anyhow::Result<Response<Body>> {
     NUM_CONNECTIONS_ACCEPTED_COUNTER
         .with_label_values(&["http"])
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 8fb9a3dee4..199b03550d 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -11,7 +11,7 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
 use pin_project_lite::pin_project;
 
 use std::{
-    net::SocketAddr,
+    net::IpAddr,
     pin::Pin,
     task::{ready, Context, Poll},
 };
@@ -133,7 +133,7 @@ pub async fn serve_websocket(
     cancel_map: &CancelMap,
     session_id: uuid::Uuid,
     hostname: Option<String>,
-    peer_addr: SocketAddr,
+    peer_addr: IpAddr,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
     handle_client(

From 5e98855d80059412ac480ba2b883d5d8d0c3efea Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 8 Dec 2023 14:52:37 +0000
Subject: [PATCH 057/202] tests: update tests that used local_fs&mock_s3 to use
 one or the other (#6015)

## Problem

Parametrizing tests over both local_fs and mock_s3 was wasting
resources: if we run a test with mock s3 we don't then need to run it
again with local fs. When we're running in CI, we don't need to run
with the mock/local storage as well as real S3. There is some value in
having CI spot issues that might otherwise only happen when running
locally, but that doesn't justify the cost of running the tests so many
more times on every PR.

## Summary of changes

- For tests that used available_remote_storages or
available_s3_storages, update them to either specify no remote storage
(therefore inherit the default, which is currently local fs), or to
specify s3_storage() for the tests that actually want an S3 API.
---
 test_runner/fixtures/neon_fixtures.py         |   3 +-
 test_runner/fixtures/remote_storage.py        |   7 +
 test_runner/regress/test_ondemand_download.py |  22 +--
 test_runner/regress/test_tenant_delete.py     | 155 ++++++++----------
 test_runner/regress/test_tenant_detach.py     |  15 +-
 test_runner/regress/test_tenant_relocation.py |   9 -
 test_runner/regress/test_tenants.py           |  10 +-
 .../test_tenants_with_remote_storage.py       |  13 +-
 test_runner/regress/test_timeline_delete.py   | 132 ++++++---------
 test_runner/regress/test_wal_acceptor.py      |  15 +-
 10 files changed, 142 insertions(+), 239 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 617d4806b2..9e0beeb4d1 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -56,6 +56,7 @@ from fixtures.remote_storage import (
     RemoteStorageKind,
     RemoteStorageUser,
     S3Storage,
+    default_remote_storage,
     remote_storage_to_toml_inline_table,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -468,7 +469,7 @@ class NeonEnvBuilder:
         # Cannot create more than one environment from one builder
         assert self.env is None, "environment already initialized"
         if default_remote_storage_if_missing and self.pageserver_remote_storage is None:
-            self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+            self.enable_pageserver_remote_storage(default_remote_storage())
         self.env = NeonEnv(self)
         return self.env
 
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index d8361133d7..565e5fa7f8 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -372,6 +372,13 @@ def s3_storage() -> RemoteStorageKind:
         return RemoteStorageKind.MOCK_S3
 
 
+def default_remote_storage() -> RemoteStorageKind:
+    """
+    The remote storage kind used in tests that do not specify a preference
+    """
+    return RemoteStorageKind.LOCAL_FS
+
+
 # serialize as toml inline table
 def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
     if not isinstance(remote_storage, (LocalFsStorage, S3Storage)):
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 86a749eaf3..af2d7aae88 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -5,7 +5,6 @@ import time
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict, Tuple
 
-import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -19,7 +18,7 @@ from fixtures.pageserver.utils import (
     wait_for_upload,
     wait_for_upload_queue_empty,
 )
-from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
+from fixtures.remote_storage import RemoteStorageKind
 from fixtures.types import Lsn
 from fixtures.utils import query_scalar, wait_until
 
@@ -45,13 +44,7 @@ def get_num_downloaded_layers(client: PageserverHttpClient):
 # If you have a large relation, check that the pageserver downloads parts of it as
 # require by queries.
 #
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-def test_ondemand_download_large_rel(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
+def test_ondemand_download_large_rel(neon_env_builder: NeonEnvBuilder):
     # thinking about using a shared environment? the test assumes that global
     # metrics are for single tenant.
     env = neon_env_builder.init_start(
@@ -145,13 +138,7 @@ def test_ondemand_download_large_rel(
 # If you have a relation with a long history of updates, the pageserver downloads the layer
 # files containing the history as needed by timetravel queries.
 #
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-def test_ondemand_download_timetravel(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
+def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
     # thinking about using a shared environment? the test assumes that global
     # metrics are for single tenant.
 
@@ -229,8 +216,7 @@ def test_ondemand_download_timetravel(
     assert filled_current_physical == filled_size, "we don't yet do layer eviction"
 
     # Wait until generated image layers are uploaded to S3
-    if remote_storage_kind is not None:
-        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)
 
     env.pageserver.stop()
 
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 3929f59e46..48f5682371 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -23,23 +23,18 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_active,
     wait_until_tenant_state,
 )
-from fixtures.remote_storage import (
-    RemoteStorageKind,
-    available_remote_storages,
-    available_s3_storages,
-)
+from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
 from fixtures.types import TenantId
 from fixtures.utils import run_pg_bench_small, wait_until
 
 
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_tenant_delete_smoke(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
     pg_bin: PgBin,
 ):
     neon_env_builder.pageserver_config_override = "test_remote_failures=1"
 
+    remote_storage_kind = s3_storage()
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
     env = neon_env_builder.init_start()
@@ -78,16 +73,15 @@ def test_tenant_delete_smoke(
             run_pg_bench_small(pg_bin, endpoint.connstr())
             wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
 
-            if remote_storage_kind in available_s3_storages():
-                assert_prefix_not_empty(
-                    neon_env_builder,
-                    prefix="/".join(
-                        (
-                            "tenants",
-                            str(tenant_id),
-                        )
-                    ),
-                )
+            assert_prefix_not_empty(
+                neon_env_builder,
+                prefix="/".join(
+                    (
+                        "tenants",
+                        str(tenant_id),
+                    )
+                ),
+            )
 
         parent = timeline
 
@@ -100,16 +94,15 @@ def test_tenant_delete_smoke(
     tenant_path = env.pageserver.tenant_dir(tenant_id)
     assert not tenant_path.exists()
 
-    if remote_storage_kind in available_s3_storages():
-        assert_prefix_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                )
-            ),
-        )
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
 
     # Deletion updates the tenant count: the one default tenant remains
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
@@ -149,9 +142,7 @@ FAILPOINTS_BEFORE_BACKGROUND = [
 def combinations():
     result = []
 
-    remotes = [RemoteStorageKind.MOCK_S3]
-    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
-        remotes.append(RemoteStorageKind.REAL_S3)
+    remotes = available_s3_storages()
 
     for remote_storage_kind in remotes:
         for delete_failpoint in FAILPOINTS:
@@ -165,8 +156,8 @@ def combinations():
     return result
 
 
-@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
 @pytest.mark.parametrize("check", list(Check))
+@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
 def test_delete_tenant_exercise_crash_safety_failpoints(
     neon_env_builder: NeonEnvBuilder,
     remote_storage_kind: RemoteStorageKind,
@@ -214,16 +205,15 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
         run_pg_bench_small(pg_bin, endpoint.connstr())
         last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
 
-        if remote_storage_kind in available_s3_storages():
-            assert_prefix_not_empty(
-                neon_env_builder,
-                prefix="/".join(
-                    (
-                        "tenants",
-                        str(tenant_id),
-                    )
-                ),
-            )
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
 
     ps_http.configure_failpoints((failpoint, "return"))
 
@@ -276,25 +266,23 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
     assert not tenant_dir.exists()
 
     # Check remote is empty
-    if remote_storage_kind in available_s3_storages():
-        assert_prefix_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                )
-            ),
-            allowed_postfix="initdb.tar.zst",
-        )
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+        allowed_postfix="initdb.tar.zst",
+    )
 
 
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_tenant_delete_is_resumed_on_attach(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
     pg_bin: PgBin,
 ):
+    remote_storage_kind = s3_storage()
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
     env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
@@ -314,16 +302,15 @@ def test_tenant_delete_is_resumed_on_attach(
             wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
 
     # sanity check, data should be there
-    if remote_storage_kind in available_s3_storages():
-        assert_prefix_not_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                )
-            ),
-        )
+    assert_prefix_not_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
 
     # failpoint before we remove index_part from s3
     failpoint = "timeline-delete-before-index-delete"
@@ -354,16 +341,15 @@ def test_tenant_delete_is_resumed_on_attach(
         iterations=iterations,
     )
 
-    if remote_storage_kind in available_s3_storages():
-        assert_prefix_not_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                )
-            ),
-        )
+    assert_prefix_not_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
 
     reason = tenant_info["state"]["data"]["reason"]
     # failpoint may not be the only error in the stack
@@ -389,17 +375,16 @@ def test_tenant_delete_is_resumed_on_attach(
     tenant_path = env.pageserver.tenant_dir(tenant_id)
     assert not tenant_path.exists()
 
-    if remote_storage_kind in available_s3_storages():
-        ps_http.deletion_queue_flush(execute=True)
-        assert_prefix_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                )
-            ),
-        )
+    ps_http.deletion_queue_flush(execute=True)
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
 
 
 def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 4a40982d78..5b63bd6161 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -21,7 +21,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.remote_storage import (
     RemoteStorageKind,
-    available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
@@ -59,16 +58,11 @@ class ReattachMode(str, enum.Enum):
 
 
 # Basic detach and re-attach test
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 @pytest.mark.parametrize(
     "mode",
     [ReattachMode.REATTACH_EXPLICIT, ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP],
 )
-def test_tenant_reattach(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, mode: str
-):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
+def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):
     # Exercise retry code path by making all uploads and downloads fail for the
     # first time. The retries print INFO-messages to the log; we will check
     # that they are present after the test.
@@ -187,16 +181,13 @@ num_rows = 100000
 #
 # I don't know what's causing that...
 @pytest.mark.skip(reason="fixme")
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_tenant_reattach_while_busy(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
 ):
     updates_started = 0
     updates_finished = 0
     updates_to_perform = 0
 
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
     env = neon_env_builder.init_start()
 
     # Run random UPDATEs on test table. On failure, try again.
@@ -439,13 +430,9 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
         should not be present in pageserver's memory"
 
 
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_detach_while_attaching(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
 ):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
     ##### First start, insert secret data and upload it to the remote storage
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index c7ff80d675..feacdcc802 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -20,7 +20,6 @@ from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
     LocalFsStorage,
     RemoteStorageKind,
-    available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
@@ -449,13 +448,9 @@ def test_tenant_relocation(
 # last-record LSN. We had a bug where GetPage incorrectly followed the
 # timeline to the ancestor without waiting for the missing WAL to
 # arrive.
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_emergency_relocate_with_branches_slow_replay(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
 ):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
     env = neon_env_builder.init_start()
     env.pageserver.is_testing_enabled_or_skip()
     pageserver_http = env.pageserver.http_client()
@@ -603,13 +598,9 @@ def test_emergency_relocate_with_branches_slow_replay(
 # exist. Update dbir" path (2), and inserts an entry in the
 # DbDirectory with 'false' to indicate there is no PG_VERSION file.
 #
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_emergency_relocate_with_branches_createdb(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
 ):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index c81be41530..22036884ee 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -18,7 +18,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import timeline_delete_wait_completed
-from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
+from fixtures.remote_storage import RemoteStorageKind
 from fixtures.types import Lsn, TenantId
 from fixtures.utils import wait_until
 from prometheus_client.samples import Sample
@@ -281,13 +281,7 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
         assert post_detach_samples == set()
 
 
-# Check that empty tenants work with or without the remote storage
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-def test_pageserver_with_empty_tenants(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
+def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     env.pageserver.allowed_errors.extend(
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index f181e70696..b7b4e2be0b 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -11,7 +11,6 @@ import os
 from pathlib import Path
 from typing import List, Tuple
 
-import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -27,7 +26,6 @@ from fixtures.pageserver.utils import (
 from fixtures.remote_storage import (
     LocalFsStorage,
     RemoteStorageKind,
-    available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
@@ -60,10 +58,7 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints):
     await asyncio.gather(*workers)
 
 
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
+def test_tenants_many(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     # FIXME: Is this expected?
@@ -218,11 +213,7 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
 def test_tenant_redownloads_truncated_file_on_startup(
     neon_env_builder: NeonEnvBuilder,
 ):
-    remote_storage_kind = RemoteStorageKind.LOCAL_FS
-
-    # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it.
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
+    # we store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it.
     env = neon_env_builder.init_start()
 
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index dab7b3879e..17113a6bc5 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -29,8 +29,7 @@ from fixtures.pageserver.utils import (
 from fixtures.remote_storage import (
     LocalFsStorage,
     RemoteStorageKind,
-    available_remote_storages,
-    available_s3_storages,
+    s3_storage,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, run_pg_bench_small, wait_until
@@ -142,25 +141,11 @@ DELETE_FAILPOINTS = [
 ]
 
 
-def combinations():
-    result = []
-
-    remotes = [RemoteStorageKind.MOCK_S3]
-    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
-        remotes.append(RemoteStorageKind.REAL_S3)
-
-    for remote_storage_kind in remotes:
-        for delete_failpoint in DELETE_FAILPOINTS:
-            result.append((remote_storage_kind, delete_failpoint))
-    return result
-
-
 # cover the two cases: remote storage configured vs not configured
-@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
+@pytest.mark.parametrize("failpoint", DELETE_FAILPOINTS)
 @pytest.mark.parametrize("check", list(Check))
 def test_delete_timeline_exercise_crash_safety_failpoints(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
     failpoint: str,
     check: Check,
     pg_bin: PgBin,
@@ -180,7 +165,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
     7. Ensure failpoint is hit
     8. Retry or restart without the failpoint and check the result.
     """
-
+    remote_storage_kind = s3_storage()
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
     env = neon_env_builder.init_start(
@@ -201,18 +186,17 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
 
         last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
 
-        if remote_storage_kind in available_s3_storages():
-            assert_prefix_not_empty(
-                neon_env_builder,
-                prefix="/".join(
-                    (
-                        "tenants",
-                        str(env.initial_tenant),
-                        "timelines",
-                        str(timeline_id),
-                    )
-                ),
-            )
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
 
     env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
     # It appears when we stopped flush loop during deletion and then pageserver is stopped
@@ -316,11 +300,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
     assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()
 
 
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 @pytest.mark.parametrize("fill_branch", [True, False])
 def test_timeline_resurrection_on_attach(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
     fill_branch: bool,
 ):
     """
@@ -329,8 +311,6 @@ def test_timeline_resurrection_on_attach(
     Original issue: https://github.com/neondatabase/neon/issues/3560
     """
 
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
     ##### First start, insert data and upload it to the remote storage
     env = neon_env_builder.init_start()
 
@@ -658,20 +638,10 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
     wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2)
 
 
-@pytest.mark.parametrize(
-    "remote_storage_kind",
-    list(
-        filter(
-            lambda s: s in (RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3),
-            available_remote_storages(),
-        )
-    ),
-)
 def test_timeline_delete_works_for_remote_smoke(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
 ):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
 
     env = neon_env_builder.init_start()
 
@@ -804,12 +774,11 @@ def test_delete_orphaned_objects(
     assert env.pageserver_remote_storage.index_path(env.initial_tenant, timeline_id).exists()
 
 
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_timeline_delete_resumed_on_attach(
     neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
     pg_bin: PgBin,
 ):
+    remote_storage_kind = s3_storage()
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
     env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
@@ -824,18 +793,17 @@ def test_timeline_delete_resumed_on_attach(
         run_pg_bench_small(pg_bin, endpoint.connstr())
         last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
 
-        if remote_storage_kind in available_s3_storages():
-            assert_prefix_not_empty(
-                neon_env_builder,
-                prefix="/".join(
-                    (
-                        "tenants",
-                        str(env.initial_tenant),
-                        "timelines",
-                        str(timeline_id),
-                    )
-                ),
-            )
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
 
     # failpoint before we remove index_part from s3
     failpoint = "timeline-delete-during-rm"
@@ -873,18 +841,17 @@ def test_timeline_delete_resumed_on_attach(
     # failpoint may not be the only error in the stack
     assert reason.endswith(f"failpoint: {failpoint}"), reason
 
-    if remote_storage_kind in available_s3_storages():
-        assert_prefix_not_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                    "timelines",
-                    str(timeline_id),
-                )
-            ),
-        )
+    assert_prefix_not_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+                "timelines",
+                str(timeline_id),
+            )
+        ),
+    )
 
     # now we stop pageserver and remove local tenant state
     env.endpoints.stop_all()
@@ -905,15 +872,14 @@ def test_timeline_delete_resumed_on_attach(
     tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
     assert not tenant_path.exists()
 
-    if remote_storage_kind in available_s3_storages():
-        assert_prefix_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(timeline_id),
-                    "timelines",
-                    str(timeline_id),
-                )
-            ),
-        )
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(timeline_id),
+                "timelines",
+                str(timeline_id),
+            )
+        ),
+    )
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index b34d2de0ba..3c40a9cb3e 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -39,10 +39,7 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
-from fixtures.remote_storage import (
-    RemoteStorageKind,
-    available_remote_storages,
-)
+from fixtures.remote_storage import RemoteStorageKind, default_remote_storage
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import get_dir_size, query_scalar, start_in_background
 
@@ -457,10 +454,9 @@ def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId,
     return sk_wal_size_mb <= target_size_mb
 
 
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
+def test_wal_backup(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
-    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
 
     env = neon_env_builder.init_start()
 
@@ -503,11 +499,10 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot
     )
 
 
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
+def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
 
-    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
 
     env = neon_env_builder.init_start()
     tenant_id = env.initial_tenant

From f5b9af6ac7f8698d3bf724c7d2ef59d4a9c17e74 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 8 Dec 2023 16:27:21 +0100
Subject: [PATCH 058/202] page cache: improve eviction-related metrics (#6077)

These changes help with identifying thrashing.

The existing `pageserver_page_cache_find_victim_iters_total` is already
useful, but it doesn't tell us how many individual find_victim() calls
are happening, only how many clock-LRU steps happened in the entire
system, without info about whether we needed to actually evict other
data vs just scan for a long time, e.g., because the cache is large.

The changes in this PR allow us to
1. count each possible outcome separately, esp evictions
2. compute mean iterations/outcome

I don't think anyone except me was paying close attention to
`pageserver_page_cache_find_victim_iters_total` before, so,
I think the slight behavior change of also counting iterations
for the 'iters exceeded' case is fine.

refs https://github.com/neondatabase/cloud/issues/8351
refs https://github.com/neondatabase/neon/issues/5479
---
 pageserver/src/metrics.rs    | 65 +++++++++++++++++++++++++++++++-----
 pageserver/src/page_cache.rs | 24 ++++++++++---
 2 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 80113220ae..dc7f89cf3a 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -285,6 +285,63 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
     },
 });
 
+pub(crate) mod page_cache_eviction_metrics {
+    use std::num::NonZeroUsize;
+
+    use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
+    use once_cell::sync::Lazy;
+
+    #[derive(Clone, Copy)]
+    pub(crate) enum Outcome {
+        FoundSlotUnused { iters: NonZeroUsize },
+        FoundSlotEvicted { iters: NonZeroUsize },
+        ItersExceeded { iters: NonZeroUsize },
+    }
+
+    static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_page_cache_find_victim_iters_total",
+            "Counter for the number of iterations in the find_victim loop",
+            &["outcome"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_page_cache_find_victim_calls",
+            "Incremented at the end of each find_victim() call.\
+             Filter by outcome to get e.g., eviction rate.",
+            &["outcome"]
+        )
+        .unwrap()
+    });
+
+    pub(crate) fn observe(outcome: Outcome) {
+        macro_rules! dry {
+            ($label:literal, $iters:expr) => {{
+                static LABEL: &'static str = $label;
+                static ITERS_TOTAL: Lazy<IntCounter> =
+                    Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
+                static CALLS: Lazy<IntCounter> =
+                    Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
+                ITERS_TOTAL.inc_by(($iters.get()) as u64);
+                CALLS.inc();
+            }};
+        }
+        match outcome {
+            Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
+            Outcome::FoundSlotEvicted { iters } => {
+                dry!("found_evicted", iters)
+            }
+            Outcome::ItersExceeded { iters } => {
+                dry!("err_iters_exceeded", iters);
+                super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
+            }
+        }
+    }
+}
+
 pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_page_cache_acquire_pinned_slot_seconds",
@@ -294,14 +351,6 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
     .expect("failed to define a metric")
 });
 
-pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_page_cache_find_victim_iters_total",
-        "Counter for the number of iterations in the find_victim loop",
-    )
-    .expect("failed to define a metric")
-});
-
 static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "page_cache_errors_total",
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index 0702057766..dbd85d2dcf 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -88,7 +88,11 @@ use utils::{
     lsn::Lsn,
 };
 
-use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
+use crate::{
+    context::RequestContext,
+    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    repository::Key,
+};
 
 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -897,8 +901,10 @@ impl PageCache {
                             // Note that just yielding to tokio during iteration without such
                             // priority boosting is likely counter-productive. We'd just give more opportunities
                             // for B to bump usage count, further starving A.
-                            crate::metrics::page_cache_errors_inc(
-                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            page_cache_eviction_metrics::observe(
+                                page_cache_eviction_metrics::Outcome::ItersExceeded {
+                                    iters: iters.try_into().unwrap(),
+                                },
                             );
                             anyhow::bail!("exceeded evict iter limit");
                         }
@@ -909,8 +915,18 @@ impl PageCache {
                     // remove mapping for old buffer
                     self.remove_mapping(old_key);
                     inner.key = None;
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
+                } else {
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotUnused {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
                 }
-                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                 return Ok((slot_idx, inner));
             }
         }

From e1a564ace21b5bfb6535089fabdbad88ca42dce0 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 8 Dec 2023 16:21:17 +0000
Subject: [PATCH 059/202] proxy simplify cancellation (#5916)

## Problem

The cancellation code was confusing and error prone (as seen before in
our memory leaks).

## Summary of changes

* Use the new `TaskTracker` primitive instead of `JoinSet` to gracefully
wait for tasks to shut down.
* Updated libs/utils/completion to use `TaskTracker`
* Remove `tokio::select` in favour of `futures::future::select` in a
specialised `run_until_cancelled()` helper function
---
 Cargo.lock                       |  12 ++++
 Cargo.toml                       |   2 +-
 libs/utils/src/completion.rs     |  23 +++---
 pageserver/src/bin/pageserver.rs |   2 -
 proxy/Cargo.toml                 |   2 +-
 proxy/src/bin/pg_sni_router.rs   |  79 ++++++++------------
 proxy/src/proxy.rs               | 120 ++++++++++++++++---------------
 proxy/src/serverless.rs          |  14 +++-
 workspace_hack/Cargo.toml        |   2 +-
 9 files changed, 130 insertions(+), 126 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index fcdc424636..d9489cdd97 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -44,6 +44,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "allocator-api2"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
+
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -2042,6 +2048,10 @@ name = "hashbrown"
 version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+dependencies = [
+ "ahash",
+ "allocator-api2",
+]
 
 [[package]]
 name = "hashlink"
@@ -5224,6 +5234,8 @@ dependencies = [
  "futures-core",
  "futures-io",
  "futures-sink",
+ "futures-util",
+ "hashbrown 0.14.0",
  "pin-project-lite",
  "tokio",
  "tracing",
diff --git a/Cargo.toml b/Cargo.toml
index ba8b49c0e0..ce590f3c7a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -149,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7", features = ["io"] }
+tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs
index e2e84dd0ee..ca6827c9b8 100644
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -1,16 +1,14 @@
-use std::sync::Arc;
-
-use tokio::sync::{mpsc, Mutex};
+use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
 
 /// While a reference is kept around, the associated [`Barrier::wait`] will wait.
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
-pub struct Completion(mpsc::Sender<()>);
+pub struct Completion(TaskTrackerToken);
 
 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
-pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
+pub struct Barrier(TaskTracker);
 
 impl Default for Barrier {
     fn default() -> Self {
@@ -21,7 +19,7 @@ impl Default for Barrier {
 
 impl Barrier {
     pub async fn wait(self) {
-        self.0.lock().await.recv().await;
+        self.0.wait().await;
     }
 
     pub async fn maybe_wait(barrier: Option<Barrier>) {
@@ -33,8 +31,7 @@ impl Barrier {
 
 impl PartialEq for Barrier {
     fn eq(&self, other: &Self) -> bool {
-        // we don't use dyn so this is good
-        Arc::ptr_eq(&self.0, &other.0)
+        TaskTracker::ptr_eq(&self.0, &other.0)
     }
 }
 
@@ -42,8 +39,10 @@ impl Eq for Barrier {}
 
 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
-    let (tx, rx) = mpsc::channel::<()>(1);
-    let rx = Mutex::new(rx);
-    let rx = Arc::new(rx);
-    (Completion(tx), Barrier(rx))
+    let tracker = TaskTracker::new();
+    // otherwise wait never exits
+    tracker.close();
+
+    let token = tracker.token();
+    (Completion(token), Barrier(tracker))
 }
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 43b35c6d08..7607119dda 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -425,7 +425,6 @@ fn start_pageserver(
     let tenant_manager = Arc::new(tenant_manager);
 
     BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx;
         let shutdown_pageserver = shutdown_pageserver.clone();
         let drive_init = async move {
             // NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -560,7 +559,6 @@ fn start_pageserver(
     }
 
     if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-        let background_jobs_barrier = background_jobs_barrier;
         let metrics_ctx = RequestContext::todo_child(
             TaskKind::MetricsCollection,
             // This task itself shouldn't download anything.
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 438190261d..c94cd55417 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -61,6 +61,7 @@ thiserror.workspace = true
 tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
+tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
@@ -77,7 +78,6 @@ postgres-protocol.workspace = true
 smol_str.workspace = true
 
 workspace_hack.workspace = true
-tokio-util.workspace = true
 
 [dev-dependencies]
 rcgen.workspace = true
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index bedbdbcc83..d48ba3a54e 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -8,6 +8,7 @@ use std::{net::SocketAddr, sync::Arc};
 use futures::future::Either;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
+use proxy::proxy::run_until_cancelled;
 use tokio::net::TcpListener;
 
 use anyhow::{anyhow, bail, ensure, Context};
@@ -20,7 +21,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::sync::CancellationToken;
 use utils::{project_git_version, sentry_init::init_sentry};
 
-use tracing::{error, info, warn, Instrument};
+use tracing::{error, info, Instrument};
 
 project_git_version!(GIT_VERSION);
 
@@ -151,63 +152,39 @@ async fn task_main(
     // will be inherited by all accepted client sockets.
     socket2::SockRef::from(&listener).set_keepalive(true)?;
 
-    let mut connections = tokio::task::JoinSet::new();
+    let connections = tokio_util::task::task_tracker::TaskTracker::new();
 
-    loop {
-        tokio::select! {
-            accept_result = listener.accept() => {
-                let (socket, peer_addr) = accept_result?;
+    while let Some(accept_result) =
+        run_until_cancelled(listener.accept(), &cancellation_token).await
+    {
+        let (socket, peer_addr) = accept_result?;
 
-                let session_id = uuid::Uuid::new_v4();
-                let tls_config = Arc::clone(&tls_config);
-                let dest_suffix = Arc::clone(&dest_suffix);
+        let session_id = uuid::Uuid::new_v4();
+        let tls_config = Arc::clone(&tls_config);
+        let dest_suffix = Arc::clone(&dest_suffix);
 
-                connections.spawn(
-                    async move {
-                        socket
-                            .set_nodelay(true)
-                            .context("failed to set socket option")?;
+        connections.spawn(
+            async move {
+                socket
+                    .set_nodelay(true)
+                    .context("failed to set socket option")?;
 
-                        info!(%peer_addr, "serving");
-                        handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
-                    }
-                    .unwrap_or_else(|e| {
-                        // Acknowledge that the task has finished with an error.
-                        error!("per-client task finished with an error: {e:#}");
-                    })
-                    .instrument(tracing::info_span!("handle_client", ?session_id))
-                );
+                info!(%peer_addr, "serving");
+                handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
             }
-            // Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
-            // If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
-            // This only counts for this loop and it will be enabled again on next `select!`.
-            //
-            // Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
-            // When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
-            // not get called again, even if there are more connections to remove.
-            Some(res) = connections.join_next() => {
-                if let Err(e) = res {
-                    if !e.is_panic() && !e.is_cancelled() {
-                        warn!("unexpected error from joined connection task: {e:?}");
-                    }
-                }
-            }
-            _ = cancellation_token.cancelled() => {
-                drop(listener);
-                break;
-            }
-        }
+            .unwrap_or_else(|e| {
+                // Acknowledge that the task has finished with an error.
+                error!("per-client task finished with an error: {e:#}");
+            })
+            .instrument(tracing::info_span!("handle_client", ?session_id)),
+        );
     }
 
-    // Drain connections
-    info!("waiting for all client connections to finish");
-    while let Some(res) = connections.join_next().await {
-        if let Err(e) = res {
-            if !e.is_panic() && !e.is_cancelled() {
-                warn!("unexpected error from joined connection task: {e:?}");
-            }
-        }
-    }
+    connections.close();
+    drop(listener);
+
+    connections.wait().await;
+
     info!("all client connections have finished");
     Ok(())
 }
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 7cf3ed5b8a..4dbffa850a 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -277,6 +277,21 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
     .unwrap()
 });
 
+pub async fn run_until_cancelled<F: std::future::Future>(
+    f: F,
+    cancellation_token: &CancellationToken,
+) -> Option<F::Output> {
+    match futures::future::select(
+        std::pin::pin!(f),
+        std::pin::pin!(cancellation_token.cancelled()),
+    )
+    .await
+    {
+        futures::future::Either::Left((f, _)) => Some(f),
+        futures::future::Either::Right(((), _)) => None,
+    }
+}
+
 pub async fn task_main(
     config: &'static ProxyConfig,
     listener: tokio::net::TcpListener,
@@ -290,71 +305,62 @@ pub async fn task_main(
     // will be inherited by all accepted client sockets.
     socket2::SockRef::from(&listener).set_keepalive(true)?;
 
-    let mut connections = tokio::task::JoinSet::new();
+    let connections = tokio_util::task::task_tracker::TaskTracker::new();
     let cancel_map = Arc::new(CancelMap::default());
 
-    loop {
-        tokio::select! {
-            accept_result = listener.accept() => {
-                let (socket, peer_addr) = accept_result?;
+    while let Some(accept_result) =
+        run_until_cancelled(listener.accept(), &cancellation_token).await
+    {
+        let (socket, peer_addr) = accept_result?;
 
-                let session_id = uuid::Uuid::new_v4();
-                let cancel_map = Arc::clone(&cancel_map);
-                connections.spawn(
-                    async move {
-                        info!("accepted postgres client connection");
+        let session_id = uuid::Uuid::new_v4();
+        let cancel_map = Arc::clone(&cancel_map);
+        connections.spawn(
+            async move {
+                info!("accepted postgres client connection");
 
-                        let mut socket = WithClientIp::new(socket);
-                        let mut peer_addr = peer_addr;
-                        if let Some(ip) = socket.wait_for_addr().await? {
-                            peer_addr = ip;
-                            tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
-                        } else if config.require_client_ip {
-                            bail!("missing required client IP");
-                        }
-
-                        socket
-                            .inner
-                            .set_nodelay(true)
-                            .context("failed to set socket option")?;
-
-                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr.ip()).await
-                    }
-                    .instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty))
-                    .unwrap_or_else(move |e| {
-                        // Acknowledge that the task has finished with an error.
-                        error!(?session_id, "per-client task finished with an error: {e:#}");
-                    }),
-                );
-            }
-            // Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
-            // If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
-            // This only counts for this loop and it will be enabled again on next `select!`.
-            //
-            // Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
-            // When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
-            // not get called again, even if there are more connections to remove.
-            Some(res) = connections.join_next() => {
-                if let Err(e) = res {
-                    if !e.is_panic() && !e.is_cancelled() {
-                        warn!("unexpected error from joined connection task: {e:?}");
-                    }
+                let mut socket = WithClientIp::new(socket);
+                let mut peer_addr = peer_addr;
+                if let Some(ip) = socket.wait_for_addr().await? {
+                    peer_addr = ip;
+                    tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
+                } else if config.require_client_ip {
+                    bail!("missing required client IP");
                 }
+
+                socket
+                    .inner
+                    .set_nodelay(true)
+                    .context("failed to set socket option")?;
+
+                handle_client(
+                    config,
+                    &cancel_map,
+                    session_id,
+                    socket,
+                    ClientMode::Tcp,
+                    peer_addr.ip(),
+                )
+                .await
             }
-            _ = cancellation_token.cancelled() => {
-                drop(listener);
-                break;
-            }
-        }
+            .instrument(info_span!(
+                "handle_client",
+                ?session_id,
+                peer_addr = tracing::field::Empty
+            ))
+            .unwrap_or_else(move |e| {
+                // Acknowledge that the task has finished with an error.
+                error!(?session_id, "per-client task finished with an error: {e:#}");
+            }),
+        );
     }
+
+    connections.close();
+    drop(listener);
+
     // Drain connections
-    while let Some(res) = connections.join_next().await {
-        if let Err(e) = res {
-            if !e.is_panic() && !e.is_cancelled() {
-                warn!("unexpected error from joined connection task: {e:?}");
-            }
-        }
-    }
+    connections.wait().await;
+
     Ok(())
 }
 
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 5a992d6461..cd496ff01e 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,6 +10,7 @@ use anyhow::bail;
 use hyper::StatusCode;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio_util::task::TaskTracker;
 
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
@@ -70,6 +71,9 @@ pub async fn task_main(
         incoming: addr_incoming,
     };
 
+    let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
+    ws_connections.close(); // allows `ws_connections.wait to complete`
+
     let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
         if let Err(err) = conn {
             error!("failed to accept TLS connection for websockets: {err:?}");
@@ -86,6 +90,7 @@ pub async fn task_main(
             let remote_addr = io.inner.remote_addr();
             let sni_name = tls.server_name().map(|s| s.to_string());
             let conn_pool = conn_pool.clone();
+            let ws_connections = ws_connections.clone();
 
             async move {
                 let peer_addr = match client_addr {
@@ -97,6 +102,7 @@ pub async fn task_main(
                     move |req: Request<Body>| {
                         let sni_name = sni_name.clone();
                         let conn_pool = conn_pool.clone();
+                        let ws_connections = ws_connections.clone();
 
                         async move {
                             let cancel_map = Arc::new(CancelMap::default());
@@ -106,6 +112,7 @@ pub async fn task_main(
                                 req,
                                 config,
                                 conn_pool,
+                                ws_connections,
                                 cancel_map,
                                 session_id,
                                 sni_name,
@@ -129,6 +136,9 @@ pub async fn task_main(
         .with_graceful_shutdown(cancellation_token.cancelled())
         .await?;
 
+    // await websocket connections
+    ws_connections.wait().await;
+
     Ok(())
 }
 
@@ -170,10 +180,12 @@ where
     }
 }
 
+#[allow(clippy::too_many_arguments)]
 async fn request_handler(
     mut request: Request<Body>,
     config: &'static ProxyConfig,
     conn_pool: Arc<conn_pool::GlobalConnPool>,
+    ws_connections: TaskTracker,
     cancel_map: Arc<CancelMap>,
     session_id: uuid::Uuid,
     sni_hostname: Option<String>,
@@ -193,7 +205,7 @@ async fn request_handler(
         let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
             .map_err(|e| ApiError::BadRequest(e.into()))?;
 
-        tokio::spawn(
+        ws_connections.spawn(
             async move {
                 if let Err(e) = websocket::serve_websocket(
                     websocket,
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 82945dfacb..3653643d7e 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -65,7 +65,7 @@ subtle = { version = "2" }
 time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
-tokio-util = { version = "0.7", features = ["codec", "compat", "io"] }
+tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
 toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
 toml_edit = { version = "0.19", features = ["serde"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }

From cf024de2027052ed7c58c017763f89b792d066b4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 8 Dec 2023 18:23:50 +0100
Subject: [PATCH 060/202] virtual_file metrics: expose max size of the fd cache
 (#6078)

And also leave a comment on how to determine current size.

Kind of follow-up to #6066

refs https://github.com/neondatabase/cloud/issues/8351
refs https://github.com/neondatabase/neon/issues/5479
---
 pageserver/src/metrics.rs      | 20 ++++++++++++++++++++
 pageserver/src/virtual_file.rs |  1 +
 2 files changed, 21 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index dc7f89cf3a..67d798c1d4 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -891,6 +891,26 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     )
     .expect("failed to define a metric")
 });
+
+pub(crate) mod virtual_file_descriptor_cache {
+    use super::*;
+
+    pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
+        register_uint_gauge!(
+            "pageserver_virtual_file_descriptor_cache_size_max",
+            "Maximum number of open file descriptors in the cache."
+        )
+        .unwrap()
+    });
+
+    // SIZE_CURRENT: derive it like so:
+    // ```
+    // sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$")
+    // -ignoring(operation)
+    // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"}
+    // ```
+}
+
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
     global: Histogram,
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 24efd14f84..10bed7ca06 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -654,6 +654,7 @@ pub fn init(num_slots: usize) {
     if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
         panic!("virtual_file::init called twice");
     }
+    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }
 
 const TEST_MAX_FILE_DESCRIPTORS: usize = 10;

From e640bc7dba325c4a7d7f0ff366118eac07f28bd1 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 8 Dec 2023 17:32:16 +0000
Subject: [PATCH 061/202] tests: allow-lists for occasional failures (#6074)

test_creating_tenant_conf_after...
- Test detaches a tenant and then re-attaches immediately: this causes a
race between pending remote LSN update and the generation bump in the
attachment.

test_gc_cutoff:
- Test rapidly restarts a pageserver before one generation has had the
chance to process deletions from the previous generation
---
 test_runner/regress/test_gc_cutoff.py   | 5 +++++
 test_runner/regress/test_tenant_conf.py | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py
index be3355f5cc..284a8c3563 100644
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -35,6 +35,11 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
 
     pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
 
+    # Because this test does a rapid series of restarts of the same node, it's possible that
+    # we are restarted again before we can clean up deletion lists form the previous generation,
+    # resulting in a subsequent startup logging a warning.
+    env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")
+
     for _ in range(5):
         with pytest.raises(subprocess.SubprocessError):
             pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index f4565c2ee2..2ed22cabc4 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -314,6 +314,10 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
 
     assert not config_path.exists(), "detach did not remove config file"
 
+    # The re-attach's increment of the generation number may invalidate deletion queue
+    # updates in flight from the previous attachment.
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
     env.pageserver.tenant_attach(tenant_id)
     wait_until(
         number_of_iterations=5,

From df1f8e13c45930f9ad53fe18076186616fed3f80 Mon Sep 17 00:00:00 2001
From: Andrew Rudenko <me@prepor.dev>
Date: Fri, 8 Dec 2023 19:58:36 +0100
Subject: [PATCH 062/202] proxy: pass neon options in deep object format
 (#6068)

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
---
 Cargo.lock                         | 33 +++++++++++++++++--------
 Cargo.toml                         |  2 +-
 proxy/src/auth/credentials.rs      |  9 +++----
 proxy/src/compute.rs               |  4 +--
 proxy/src/console/provider.rs      | 13 +++++++++-
 proxy/src/console/provider/neon.rs | 13 +++++++---
 proxy/src/proxy.rs                 | 39 +++++++++++++++---------------
 proxy/src/proxy/tests.rs           |  2 +-
 proxy/src/serverless/conn_pool.rs  |  2 +-
 workspace_hack/Cargo.toml          |  6 +++--
 10 files changed, 76 insertions(+), 47 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d9489cdd97..1364c9d84f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -896,7 +896,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
 dependencies = [
  "memchr",
  "once_cell",
- "regex-automata",
+ "regex-automata 0.1.10",
  "serde",
 ]
 
@@ -2543,7 +2543,7 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
 dependencies = [
- "regex-automata",
+ "regex-automata 0.1.10",
 ]
 
 [[package]]
@@ -2569,9 +2569,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
 [[package]]
 name = "memchr"
-version = "2.5.0"
+version = "2.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
 
 [[package]]
 name = "memoffset"
@@ -3820,13 +3820,14 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.8.2"
+version = "1.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax 0.7.2",
+ "regex-automata 0.4.3",
+ "regex-syntax 0.8.2",
 ]
 
 [[package]]
@@ -3838,6 +3839,17 @@ dependencies = [
  "regex-syntax 0.6.29",
 ]
 
+[[package]]
+name = "regex-automata"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax 0.8.2",
+]
+
 [[package]]
 name = "regex-syntax"
 version = "0.6.29"
@@ -3846,9 +3858,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
 
 [[package]]
 name = "regex-syntax"
-version = "0.7.2"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
 
 [[package]]
 name = "relative-path"
@@ -6231,7 +6243,8 @@ dependencies = [
  "prost",
  "rand 0.8.5",
  "regex",
- "regex-syntax 0.7.2",
+ "regex-automata 0.4.3",
+ "regex-syntax 0.8.2",
  "reqwest",
  "ring 0.16.20",
  "rustls",
diff --git a/Cargo.toml b/Cargo.toml
index ce590f3c7a..33f56e084f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -109,7 +109,7 @@ pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-regex = "1.4"
+regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index dd7c58255f..72149e8e29 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -3,7 +3,7 @@
 use crate::{
     auth::password_hack::parse_endpoint_param,
     error::UserFacingError,
-    proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI},
+    proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -140,7 +140,7 @@ impl ClientCredentials {
         let cache_key = format!(
             "{}{}",
             project.as_deref().unwrap_or(""),
-            neon_options(params).unwrap_or("".to_string())
+            neon_options_str(params)
         )
         .into();
 
@@ -406,10 +406,7 @@ mod tests {
         let peer_addr = IpAddr::from([127, 0, 0, 1]);
         let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
         assert_eq!(creds.project.as_deref(), Some("project"));
-        assert_eq!(
-            creds.cache_key,
-            "projectneon_endpoint_type:read_write neon_lsn:0/2"
-        );
+        assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");
 
         Ok(())
     }
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index c838c8fc38..78c56300a5 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,6 +1,6 @@
 use crate::{
     auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
-    error::UserFacingError, proxy::is_neon_param,
+    error::UserFacingError, proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -275,7 +275,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
     #[allow(unstable_name_collisions)]
     let options: String = params
         .options_raw()?
-        .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
+        .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
         .intersperse(" ") // TODO: use impl from std once it's stabilized
         .collect();
 
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index ccb5cbdb92..b0a73fd03d 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -201,7 +201,18 @@ pub struct ConsoleReqExtra<'a> {
     pub session_id: uuid::Uuid,
     /// Name of client application, if set.
     pub application_name: Option<&'a str>,
-    pub options: Option<&'a str>,
+    pub options: Vec<(String, String)>,
+}
+
+impl<'a> ConsoleReqExtra<'a> {
+    // https://swagger.io/docs/specification/serialization/ DeepObject format
+    // paramName[prop1]=value1&paramName[prop2]=value2&....
+    pub fn options_as_deep_object(&self) -> Vec<(String, String)> {
+        self.options
+            .iter()
+            .map(|(k, v)| (format!("options[{}]", k), v.to_string()))
+            .collect()
+    }
 }
 
 /// Auth secret which is managed by the cloud.
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index f0510e91ea..f8c3ee5b58 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -106,7 +106,7 @@ impl Api {
     ) -> Result<NodeInfo, WakeComputeError> {
         let request_id = uuid::Uuid::new_v4().to_string();
         async {
-            let request = self
+            let mut request_builder = self
                 .endpoint
                 .get("proxy_wake_compute")
                 .header("X-Request-ID", &request_id)
@@ -115,9 +115,14 @@ impl Api {
                 .query(&[
                     ("application_name", extra.application_name),
                     ("project", Some(&creds.endpoint)),
-                    ("options", extra.options),
-                ])
-                .build()?;
+                ]);
+
+            request_builder = if extra.options.is_empty() {
+                request_builder
+            } else {
+                request_builder.query(&extra.options_as_deep_object())
+            };
+            let request = request_builder.build()?;
 
             info!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 4dbffa850a..018f774c7e 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -968,12 +968,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             allow_self_signed_compute,
         } = self;
 
-        let console_options = neon_options(params);
-
         let extra = console::ConsoleReqExtra {
             session_id, // aka this connection's id
             application_name: params.get("application_name"),
-            options: console_options.as_deref(),
+            options: neon_options(params),
         };
 
         let mut latency_timer = LatencyTimer::new(mode.protocol_label());
@@ -1033,26 +1031,29 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
     }
 }
 
-pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
+pub fn neon_options(params: &StartupMessageParams) -> Vec<(String, String)> {
     #[allow(unstable_name_collisions)]
-    let options: String = params
-        .options_raw()?
-        .filter(|opt| is_neon_param(opt))
-        .sorted() // we sort it to use as cache key
-        .intersperse(" ") // TODO: use impl from std once it's stabilized
-        .collect();
-
-    // Don't even bother with empty options.
-    if options.is_empty() {
-        return None;
+    match params.options_raw() {
+        Some(options) => options.filter_map(neon_option).collect(),
+        None => vec![],
     }
-
-    Some(options)
 }
 
-pub fn is_neon_param(bytes: &str) -> bool {
+pub fn neon_options_str(params: &StartupMessageParams) -> String {
+    #[allow(unstable_name_collisions)]
+    neon_options(params)
+        .iter()
+        .map(|(k, v)| format!("{}:{}", k, v))
+        .sorted() // we sort it to use as cache key
+        .intersperse(" ".to_owned())
+        .collect()
+}
+
+pub fn neon_option(bytes: &str) -> Option<(String, String)> {
     static RE: OnceCell<Regex> = OnceCell::new();
-    RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
+    let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap());
 
-    RE.get().unwrap().is_match(bytes)
+    let cap = re.captures(bytes)?;
+    let (_, [k, v]) = cap.extract();
+    Some((k.to_owned(), v.to_owned()))
 }
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 222661db4a..31c3ad1055 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -491,7 +491,7 @@ fn helper_create_connect_info(
     let extra = console::ConsoleReqExtra {
         session_id: uuid::Uuid::new_v4(),
         application_name: Some("TEST"),
-        options: None,
+        options: vec![],
     };
     let creds = auth::BackendType::Test(mechanism);
     (cache, extra, creds)
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index b9d1a9692d..734df11368 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -433,7 +433,7 @@ async fn connect_to_compute(
     let extra = console::ConsoleReqExtra {
         session_id: uuid::Uuid::new_v4(),
         application_name: Some(APP_NAME),
-        options: console_options.as_deref(),
+        options: console_options,
     };
     // TODO(anna): this is a bit hacky way, consider using console notification listener.
     if !config.disable_ip_check_for_http {
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 3653643d7e..4621a75c0b 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -53,7 +53,8 @@ num-traits = { version = "0.2", features = ["i128"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
-regex-syntax = { version = "0.7" }
+regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
+regex-syntax = { version = "0.8" }
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] }
 ring = { version = "0.16", features = ["std"] }
 rustls = { version = "0.21", features = ["dangerous_configuration"] }
@@ -90,7 +91,8 @@ memchr = { version = "2" }
 nom = { version = "7" }
 prost = { version = "0.11" }
 regex = { version = "1" }
-regex-syntax = { version = "0.7" }
+regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
+regex-syntax = { version = "0.8" }
 serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }

From 0ba4cae491c26c7678f7abddb68bf76134a7df90 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Fri, 8 Dec 2023 12:55:44 -0800
Subject: [PATCH 063/202] Fix RLS/REPLICATION granting (#6083)

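## Problem

The role-update path in `handle_roles` ran `ALTER ROLE ... BYPASSRLS REPLICATION`
and treated any existing role lacking those attributes as needing an update.
That path can run on every role, including user-created ones, so reapplying a
spec could grant extra privileges to roles that were never given them.

## Summary of changes

- Create `neon_superuser` with `BYPASSRLS` in addition to `REPLICATION`.
- Create console roles with `INHERIT ... IN ROLE neon_superuser` instead of
  giving each role `BYPASSRLS REPLICATION` directly.
- Limit the update path to `ALTER ROLE <name>` plus the role's own options.
- Drop the `replication` and `bypassrls` fields from `Role` and stop querying
  them in `get_existing_roles`.
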
## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 compute_tools/src/compute.rs    |  2 +-
 compute_tools/src/pg_helpers.rs |  7 +------
 compute_tools/src/spec.rs       | 16 +++++++++++-----
 libs/compute_api/src/spec.rs    |  2 --
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 9969b2166c..b39a800f14 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
                     IF NOT EXISTS (
                         SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                     THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
                         IF array_length(roles, 1) IS NOT NULL THEN
                             EXECUTE format('GRANT neon_superuser TO %s',
                                            array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 8722822f5e..b79e516650 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
     let postgres_roles = xact
-        .query(
-            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
-            &[],
-        )?
+        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
         .iter()
         .map(|row| Role {
             name: row.get("rolname"),
             encrypted_password: row.get("rolpassword"),
-            replication: Some(row.get("rolreplication")),
-            bypassrls: Some(row.get("rolbypassrls")),
             options: None,
         })
         .collect();
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index f98333d8bf..ba1ee6d1b2 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -252,8 +252,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
         let action = if let Some(r) = pg_role {
             if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                 || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-                || !r.bypassrls.unwrap_or(false)
-                || !r.replication.unwrap_or(false)
             {
                 RoleAction::Update
             } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -285,14 +283,22 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
         match action {
             RoleAction::None => {}
             RoleAction::Update => {
-                let mut query: String =
-                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
+                // This can be run on /every/ role! Not just ones created through the console.
+                // This means that if you add some funny ALTER here that adds a permission,
+                // this will get run even on user-created roles! This will result in different
+                // behavior before and after a spec gets reapplied. The below ALTER as it stands
+                // now only grants LOGIN and changes the password. Please do not allow this branch
+                // to do anything silly.
+                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
                 query.push_str(&role.to_pg_options());
                 xact.execute(query.as_str(), &[])?;
             }
             RoleAction::Create => {
+                // This branch only runs when roles are created through the console, so it is
+                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
+                // from neon_superuser.
                 let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
                     name.pg_quote()
                 );
                 info!("role create query: '{}'", &query);
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index d9c384a5d3..2a483188e4 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -207,8 +207,6 @@ pub struct DeltaOp {
 pub struct Role {
     pub name: PgIdent,
     pub encrypted_password: Option<String>,
-    pub replication: Option<bool>,
-    pub bypassrls: Option<bool>,
     pub options: GenericOptions,
 }
 

From f0d15cee6f2921bfbd90d41ebdd3d11e4015396e Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 11 Dec 2023 13:21:02 +0200
Subject: [PATCH 064/202] build: update azure-* to 0.17 (#6081)

this is a drive-by upgrade while we refresh the access tokens at the
same time.
---
 Cargo.lock | 51 +++++++++++++++++++++++++++++++++++++--------------
 Cargo.toml |  8 ++++----
 2 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1364c9d84f..da480b8e0c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -184,7 +184,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
 dependencies = [
  "concurrent-queue",
- "event-listener",
+ "event-listener 2.5.3",
  "futures-core",
 ]
 
@@ -205,11 +205,13 @@ dependencies = [
 
 [[package]]
 name = "async-lock"
-version = "2.8.0"
+version = "3.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b"
+checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c"
 dependencies = [
- "event-listener",
+ "event-listener 4.0.0",
+ "event-listener-strategy",
+ "pin-project-lite",
 ]
 
 [[package]]
@@ -692,9 +694,9 @@ dependencies = [
 
 [[package]]
 name = "azure_core"
-version = "0.16.0"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467"
+checksum = "4ccd63c07d1fbfb3d4543d7ea800941bf5a30db1911b9b9e4db3b2c4210a434f"
 dependencies = [
  "async-trait",
  "base64 0.21.1",
@@ -719,9 +721,9 @@ dependencies = [
 
 [[package]]
 name = "azure_identity"
-version = "0.16.2"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9"
+checksum = "8bd7ea32ca7eb66ff4757f83baac702ff11d469e5de365b6bc6f79f9c25d3436"
 dependencies = [
  "async-lock",
  "async-trait",
@@ -740,9 +742,9 @@ dependencies = [
 
 [[package]]
 name = "azure_storage"
-version = "0.16.0"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97"
+checksum = "83ca0a07f89fd72a006da4713e93af3d6c44a693e61a1c3c2e7985de39c182e8"
 dependencies = [
  "RustyXML",
  "async-trait",
@@ -762,9 +764,9 @@ dependencies = [
 
 [[package]]
 name = "azure_storage_blobs"
-version = "0.16.0"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4"
+checksum = "8096c04d370118323c42b2752aa1883e4880a56ef65239f317b359f263b6e194"
 dependencies = [
  "RustyXML",
  "azure_core",
@@ -1686,6 +1688,27 @@ version = "2.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
 
+[[package]]
+name = "event-listener"
+version = "4.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae"
+dependencies = [
+ "concurrent-queue",
+ "parking",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "event-listener-strategy"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3"
+dependencies = [
+ "event-listener 4.0.0",
+ "pin-project-lite",
+]
+
 [[package]]
 name = "fail"
 version = "0.5.1"
@@ -3678,9 +3701,9 @@ dependencies = [
 
 [[package]]
 name = "quick-xml"
-version = "0.30.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
+checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
 dependencies = [
  "memchr",
  "serde",
diff --git a/Cargo.toml b/Cargo.toml
index 33f56e084f..b5eece5e35 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,10 +38,10 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
-azure_core = "0.16"
-azure_identity = "0.16"
-azure_storage = "0.16"
-azure_storage_blobs = "0.16"
+azure_core = "0.17"
+azure_identity = "0.17"
+azure_storage = "0.17"
+azure_storage_blobs = "0.17"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"

From 66a7a226f8db9a96a152bf98cc23d599a80eb554 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 11 Dec 2023 15:13:27 +0000
Subject: [PATCH 065/202] test_runner: use toml instead of formatted strings
 (#6088)

## Problem

A bunch of refactorings extracted from
https://github.com/neondatabase/neon/pull/6087 (not required for it);
the most significant one is using toml instead of formatted strings.

## Summary of changes
- Use toml instead of formatted strings for config
- Skip pageserver log check if `pageserver.log` doesn't exist
- `chmod -x test_runner/regress/test_config.py`
---
 test_runner/fixtures/neon_fixtures.py     | 132 ++++++++++------------
 test_runner/fixtures/remote_storage.py    |  22 ++--
 test_runner/regress/test_compatibility.py |   4 +-
 test_runner/regress/test_config.py        |   0
 4 files changed, 74 insertions(+), 84 deletions(-)
 mode change 100755 => 100644 test_runner/regress/test_config.py

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9e0beeb4d1..c569b63d4e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -28,6 +28,7 @@ import jwt
 import psycopg2
 import pytest
 import requests
+import toml
 from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
@@ -436,7 +437,7 @@ class NeonEnvBuilder:
         # Pageserver remote storage
         self.pageserver_remote_storage = pageserver_remote_storage
         # Safekeepers remote storage
-        self.sk_remote_storage: Optional[RemoteStorage] = None
+        self.safekeepers_remote_storage: Optional[RemoteStorage] = None
 
         self.broker = broker
         self.run_id = run_id
@@ -534,9 +535,11 @@ class NeonEnvBuilder:
         self.pageserver_remote_storage = ret
 
     def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind):
-        assert self.sk_remote_storage is None, "sk_remote_storage already configured"
+        assert (
+            self.safekeepers_remote_storage is None
+        ), "safekeepers_remote_storage already configured"
 
-        self.sk_remote_storage = self._configure_and_create_remote_storage(
+        self.safekeepers_remote_storage = self._configure_and_create_remote_storage(
             kind, RemoteStorageUser.SAFEKEEPER
         )
 
@@ -589,7 +592,7 @@ class NeonEnvBuilder:
                 directory_to_clean.rmdir()
 
     def cleanup_remote_storage(self):
-        for x in [self.pageserver_remote_storage, self.sk_remote_storage]:
+        for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]:
             if isinstance(x, S3Storage):
                 x.do_cleanup()
 
@@ -693,7 +696,7 @@ class NeonEnv:
         self.pageservers: List[NeonPageserver] = []
         self.broker = config.broker
         self.pageserver_remote_storage = config.pageserver_remote_storage
-        self.safekeepers_remote_storage = config.sk_remote_storage
+        self.safekeepers_remote_storage = config.safekeepers_remote_storage
         self.pg_version = config.pg_version
         # Binary path for pageserver, safekeeper, etc
         self.neon_binpath = config.neon_binpath
@@ -718,25 +721,17 @@ class NeonEnv:
             self.attachment_service = None
 
         # Create a config file corresponding to the options
-        toml = textwrap.dedent(
-            f"""
-            default_tenant_id = '{config.initial_tenant}'
-        """
-        )
+        cfg: Dict[str, Any] = {
+            "default_tenant_id": str(self.initial_tenant),
+            "broker": {
+                "listen_addr": self.broker.listen_addr(),
+            },
+            "pageservers": [],
+            "safekeepers": [],
+        }
 
         if self.control_plane_api is not None:
-            toml += textwrap.dedent(
-                f"""
-                control_plane_api = '{self.control_plane_api}'
-            """
-            )
-
-        toml += textwrap.dedent(
-            f"""
-            [broker]
-            listen_addr = '{self.broker.listen_addr()}'
-        """
-        )
+            cfg["control_plane_api"] = self.control_plane_api
 
         # Create config for pageserver
         http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -749,26 +744,24 @@ class NeonEnv:
                 http=self.port_distributor.get_port(),
             )
 
-            toml += textwrap.dedent(
-                f"""
-                [[pageservers]]
-                id={ps_id}
-                listen_pg_addr = 'localhost:{pageserver_port.pg}'
-                listen_http_addr = 'localhost:{pageserver_port.http}'
-                pg_auth_type = '{pg_auth_type}'
-                http_auth_type = '{http_auth_type}'
-            """
-            )
-
+            ps_cfg: Dict[str, Any] = {
+                "id": ps_id,
+                "listen_pg_addr": f"localhost:{pageserver_port.pg}",
+                "listen_http_addr": f"localhost:{pageserver_port.http}",
+                "pg_auth_type": pg_auth_type,
+                "http_auth_type": http_auth_type,
+            }
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
                 NeonPageserver(
                     self,
                     ps_id,
                     port=pageserver_port,
-                    config_override=config.pageserver_config_override,
+                    config_override=self.pageserver_config_override,
                 )
             )
+            cfg["pageservers"].append(ps_cfg)
+
         # Create config and a Safekeeper object for each safekeeper
         for i in range(1, config.num_safekeepers + 1):
             port = SafekeeperPort(
@@ -777,32 +770,22 @@ class NeonEnv:
                 http=self.port_distributor.get_port(),
             )
             id = config.safekeepers_id_start + i  # assign ids sequentially
-            toml += textwrap.dedent(
-                f"""
-                [[safekeepers]]
-                id = {id}
-                pg_port = {port.pg}
-                pg_tenant_only_port = {port.pg_tenant_only}
-                http_port = {port.http}
-                sync = {'true' if config.safekeepers_enable_fsync else 'false'}"""
-            )
+            sk_cfg: Dict[str, Any] = {
+                "id": id,
+                "pg_port": port.pg,
+                "pg_tenant_only_port": port.pg_tenant_only,
+                "http_port": port.http,
+                "sync": config.safekeepers_enable_fsync,
+            }
             if config.auth_enabled:
-                toml += textwrap.dedent(
-                    """
-                auth_enabled = true
-                """
-                )
-            if config.sk_remote_storage is not None:
-                toml += textwrap.dedent(
-                    f"""
-                remote_storage = "{remote_storage_to_toml_inline_table(config.sk_remote_storage)}"
-                """
-                )
-            safekeeper = Safekeeper(env=self, id=id, port=port)
-            self.safekeepers.append(safekeeper)
+                sk_cfg["auth_enabled"] = True
+            if self.safekeepers_remote_storage is not None:
+                sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table()
+            self.safekeepers.append(Safekeeper(env=self, id=id, port=port))
+            cfg["safekeepers"].append(sk_cfg)
 
-        log.info(f"Config: {toml}")
-        self.neon_cli.init(toml)
+        log.info(f"Config: {cfg}")
+        self.neon_cli.init(cfg)
 
     def start(self):
         # Start up broker, pageserver and all safekeepers
@@ -1288,10 +1271,10 @@ class NeonCli(AbstractNeonCli):
 
     def init(
         self,
-        config_toml: str,
+        config: Dict[str, Any],
     ) -> "subprocess.CompletedProcess[str]":
         with tempfile.NamedTemporaryFile(mode="w+") as tmp:
-            tmp.write(config_toml)
+            tmp.write(toml.dumps(config))
             tmp.flush()
 
             cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version]
@@ -1732,8 +1715,13 @@ class NeonPageserver(PgProtocol):
         return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}"))
 
     def assert_no_errors(self):
-        logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
-        errors = scan_pageserver_log_for_errors(logfile, self.allowed_errors)
+        logfile = self.workdir / "pageserver.log"
+        if not logfile.exists():
+            log.warning(f"Skipping log check: {logfile} does not exist")
+            return
+
+        with logfile.open("r") as f:
+            errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
 
         for _lineno, error in errors:
             log.info(f"not allowed error: {error.strip()}")
@@ -1757,7 +1745,10 @@ class NeonPageserver(PgProtocol):
 
     def log_contains(self, pattern: str) -> Optional[str]:
         """Check that the pageserver log contains a line that matches the given regex"""
-        logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
+        logfile = self.workdir / "pageserver.log"
+        if not logfile.exists():
+            log.warning(f"Skipping log check: {logfile} does not exist")
+            return None
 
         contains_re = re.compile(pattern)
 
@@ -1766,14 +1757,11 @@ class NeonPageserver(PgProtocol):
         # no guarantee it is already present in the log file. This hasn't
         # been a problem in practice, our python tests are not fast enough
         # to hit that race condition.
-        while True:
-            line = logfile.readline()
-            if not line:
-                break
-
-            if contains_re.search(line):
-                # found it!
-                return line
+        with logfile.open("r") as f:
+            for line in f:
+                if contains_re.search(line):
+                    # found it!
+                    return line
 
         return None
 
@@ -3355,8 +3343,6 @@ def parse_project_git_version_output(s: str) -> str:
 
     The information is generated by utils::project_git_version!
     """
-    import re
-
     res = re.search(r"git(-env)?:([0-9a-fA-F]{8,40})(-\S+)?", s)
     if res and (commit := res.group(2)):
         return commit
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 565e5fa7f8..824531bea4 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -9,6 +9,7 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
 import boto3
+import toml
 from mypy_boto3_s3 import S3Client
 
 from fixtures.log_helper import log
@@ -133,7 +134,10 @@ class LocalFsStorage:
             return json.load(f)
 
     def to_toml_inline_table(self) -> str:
-        return f"local_path='{self.root}'"
+        rv = {
+            "local_path": str(self.root),
+        }
+        return toml.TomlEncoder().dump_inline_table(rv)
 
     def cleanup(self):
         # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files
@@ -174,18 +178,18 @@ class S3Storage:
         )
 
     def to_toml_inline_table(self) -> str:
-        s = [
-            f"bucket_name='{self.bucket_name}'",
-            f"bucket_region='{self.bucket_region}'",
-        ]
+        rv = {
+            "bucket_name": self.bucket_name,
+            "bucket_region": self.bucket_region,
+        }
 
         if self.prefix_in_bucket is not None:
-            s.append(f"prefix_in_bucket='{self.prefix_in_bucket}'")
+            rv["prefix_in_bucket"] = self.prefix_in_bucket
 
         if self.endpoint is not None:
-            s.append(f"endpoint='{self.endpoint}'")
+            rv["endpoint"] = self.endpoint
 
-        return ",".join(s)
+        return toml.TomlEncoder().dump_inline_table(rv)
 
     def do_cleanup(self):
         if not self.cleanup:
@@ -384,4 +388,4 @@ def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
     if not isinstance(remote_storage, (LocalFsStorage, S3Storage)):
         raise Exception("invalid remote storage type")
 
-    return f"{{{remote_storage.to_toml_inline_table()}}}"
+    return remote_storage.to_toml_inline_table()
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index f3c6af4427..35963c0d41 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from typing import Any, List, Optional
 
 import pytest
-import toml  # TODO: replace with tomllib for Python >= 3.11
+import toml
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonCli,
@@ -411,7 +411,7 @@ def check_neon_works(
     config.initial_tenant = snapshot_config["default_tenant_id"]
     config.pg_distrib_dir = pg_distrib_dir
     config.remote_storage = None
-    config.sk_remote_storage = None
+    config.safekeepers_remote_storage = None
 
     # Use the "target" binaries to launch the storage nodes
     config_target = config
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
old mode 100755
new mode 100644

From f1fc1fd639a2b061c6f1b5c2cb116eb3f4741d5b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 11 Dec 2023 15:52:33 +0000
Subject: [PATCH 066/202] pageserver: further refactoring from TenantId to
 TenantShardId (#6059)

## Problem

In https://github.com/neondatabase/neon/pull/5957, the most essential
types were updated to use TenantShardId rather than TenantId. That
unblocked other work, but didn't fully enable running multiple shards
from the same tenant on the same pageserver.

## Summary of changes

- Use TenantShardId in page cache key for materialized pages
- Update mgr.rs get_tenant() and list_tenants() functions to use a shard
id, and update all callers.
- Eliminate the exactly_one_or_none helper in mgr.rs and all code that
used it
- Convert timeline HTTP routes to use tenant_shard_id

Note on page cache:
```
struct MaterializedPageHashKey {
    /// Why is this TenantShardId rather than TenantId?
    ///
    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant.  However, this
    /// is not the case for certain internally-generated pages (e.g. relation sizes).  In future, we may make this
    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
    /// special-cased in some other way.
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    key: Key,
}
```
---
 control_plane/src/bin/neon_local.rs           |   2 +-
 control_plane/src/tenant_migration.rs         |   2 +-
 libs/pageserver_api/src/models.rs             |   8 +-
 libs/pageserver_api/src/shard.rs              |   5 +
 pageserver/src/consumption_metrics.rs         |  14 +-
 pageserver/src/consumption_metrics/metrics.rs |  10 +-
 pageserver/src/http/routes.rs                 | 222 ++++++++++--------
 pageserver/src/metrics.rs                     |  38 ++-
 pageserver/src/page_cache.rs                  |  24 +-
 pageserver/src/task_mgr.rs                    |  41 ++--
 pageserver/src/tenant.rs                      |   4 +-
 pageserver/src/tenant/delete.rs               |   4 +-
 pageserver/src/tenant/mgr.rs                  | 104 +++-----
 .../src/tenant/remote_timeline_client.rs      |   2 +-
 pageserver/src/tenant/storage_layer/layer.rs  |   2 +-
 pageserver/src/tenant/tasks.rs                |  14 +-
 pageserver/src/tenant/timeline.rs             |  49 ++--
 pageserver/src/tenant/timeline/delete.rs      |   6 +-
 .../src/tenant/timeline/eviction_task.rs      |   4 +-
 pageserver/src/tenant/timeline/walreceiver.rs |  21 +-
 .../walreceiver/walreceiver_connection.rs     |   2 +-
 21 files changed, 299 insertions(+), 279 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 8d53a6a658..6f0b929ac6 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -168,7 +168,7 @@ fn print_timelines_tree(
                     info: t.clone(),
                     children: BTreeSet::new(),
                     name: timeline_name_mappings
-                        .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
+                        .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
                 },
             )
         })
diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs
index c0c44e279f..fbb0358158 100644
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -165,7 +165,7 @@ pub fn migrate_tenant(
         let found = other_ps_tenants
             .into_iter()
             .map(|t| t.id)
-            .any(|i| i == tenant_id);
+            .any(|i| i.tenant_id == tenant_id);
         if !found {
             continue;
         }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 495a58e865..2572bcf74f 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -357,7 +357,7 @@ pub enum TenantAttachmentStatus {
 
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    pub id: TenantId,
+    pub id: TenantShardId,
     // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
     pub state: TenantState,
     /// Sum of the size of all layer files.
@@ -369,7 +369,7 @@ pub struct TenantInfo {
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    pub tenant_id: TenantId,
+    pub tenant_id: TenantShardId,
     pub timeline_id: TimelineId,
 
     pub ancestor_timeline_id: Option<TimelineId>,
@@ -823,7 +823,7 @@ mod tests {
     fn test_tenantinfo_serde() {
         // Test serialization/deserialization of TenantInfo
         let original_active = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
             state: TenantState::Active,
             current_physical_size: Some(42),
             attachment_status: TenantAttachmentStatus::Attached,
@@ -840,7 +840,7 @@ mod tests {
         });
 
         let original_broken = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
             state: TenantState::Broken {
                 reason: "reason".into(),
                 backtrace: "backtrace info".into(),
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 9e83e0eee2..052fbd1402 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -76,6 +76,11 @@ impl TenantShardId {
     pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
         ShardSlug(self)
     }
+
+    /// Whether this is shard number zero; convenience for code that special-cases the 0th shard.
+    pub fn is_zero(&self) -> bool {
+        self.shard_number == ShardNumber(0)
+    }
 }
 
 /// Formatting helper
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 7ad6a0f890..bb13bdd5e5 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -269,12 +269,18 @@ async fn calculate_synthetic_size_worker(
             }
         };
 
-        for (tenant_id, tenant_state) in tenants {
+        for (tenant_shard_id, tenant_state) in tenants {
             if tenant_state != TenantState::Active {
                 continue;
             }
 
-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
+            if !tenant_shard_id.is_zero() {
+                // We only send consumption metrics from shard 0, so don't waste time calculating
+                // synthetic size on other shards.
+                continue;
+            }
+
+            if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) {
                 // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                 // We can put in some prioritization for consumption metrics.
                 // Same for the loop that fetches computed metrics.
@@ -286,7 +292,9 @@ async fn calculate_synthetic_size_worker(
                     {
                         return Ok(());
                     }
-                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
+                    error!(
+                        "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"
+                    );
                 }
             }
         }
diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs
index 918e45ea9e..0b827816bc 100644
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -2,7 +2,6 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
-use pageserver_api::shard::ShardNumber;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
     id::{TenantId, TimelineId},
@@ -198,12 +197,12 @@ pub(super) async fn collect_all_metrics(
     };
 
     let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
-        if state != TenantState::Active {
+        if state != TenantState::Active || !id.is_zero() {
             None
         } else {
             crate::tenant::mgr::get_tenant(id, true)
                 .ok()
-                .map(|tenant| (id, tenant))
+                .map(|tenant| (id.tenant_id, tenant))
         }
     });
 
@@ -229,11 +228,6 @@ where
     while let Some((tenant_id, tenant)) = tenants.next().await {
         let mut tenant_resident_size = 0;
 
-        // Sharded tenants report all consumption metrics from shard zero
-        if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
-            continue;
-        }
-
         for timeline in tenant.list_timelines() {
             let timeline_id = timeline.timeline_id;
 
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 9e41d912c2..b9b0250671 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -352,8 +352,7 @@ async fn build_timeline_info_common(
     let walreceiver_status = timeline.walreceiver_status();
 
     let info = TimelineInfo {
-        // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id
-        tenant_id: timeline.tenant_shard_id.tenant_id,
+        tenant_id: timeline.tenant_shard_id,
         timeline_id: timeline.timeline_id,
         ancestor_timeline_id,
         ancestor_lsn,
@@ -480,15 +479,15 @@ async fn timeline_list_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let include_non_incremental_logical_size: Option<bool> =
         parse_query_param(&request, "include-non-incremental-logical-size")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
 
     let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
         let timelines = tenant.list_timelines();
 
         let mut response_data = Vec::with_capacity(timelines.len());
@@ -507,7 +506,9 @@ async fn timeline_list_handler(
         }
         Ok::<Vec<TimelineInfo>, ApiError>(response_data)
     }
-    .instrument(info_span!("timeline_list", %tenant_id))
+    .instrument(info_span!("timeline_list", 
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()))
     .await?;
 
     json_response(StatusCode::OK, response_data)
@@ -517,17 +518,17 @@ async fn timeline_detail_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let include_non_incremental_logical_size: Option<bool> =
         parse_query_param(&request, "include-non-incremental-logical-size")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     // Logical size calculation needs downloading.
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
 
     let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
 
         let timeline = tenant
             .get_timeline(timeline_id, false)
@@ -544,7 +545,10 @@ async fn timeline_detail_handler(
 
         Ok::<_, ApiError>(timeline_info)
     }
-    .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
+    .instrument(info_span!("timeline_detail", 
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
     .await?;
 
     json_response(StatusCode::OK, timeline_info)
@@ -554,8 +558,15 @@ async fn get_lsn_by_timestamp_handler(
     request: Request<Body>,
     cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    if !tenant_shard_id.is_zero() {
+        // Requires SLRU contents, which are only stored on shard zero
+        return Err(ApiError::BadRequest(anyhow!(
+            "Timestamp to LSN lookups are only available on shard zero"
+        )));
+    }
 
     let version: Option<u8> = parse_query_param(&request, "version")?;
 
@@ -567,7 +578,7 @@ async fn get_lsn_by_timestamp_handler(
     let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
     let result = timeline
         .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
         .await?;
@@ -602,8 +613,15 @@ async fn get_timestamp_of_lsn_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    if !tenant_shard_id.is_zero() {
+        // Requires SLRU contents, which are only stored on shard zero
+        return Err(ApiError::BadRequest(anyhow!(
+            "LSN to timestamp lookups are only available on shard zero"
+        )));
+    }
 
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
 
@@ -613,7 +631,7 @@ async fn get_timestamp_of_lsn_handler(
         .map_err(ApiError::BadRequest)?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
     let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
 
     match result {
@@ -805,11 +823,11 @@ async fn tenant_status(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, false)?;
 
         // Calculate total physical size of all timelines
         let mut current_physical_size = 0;
@@ -819,13 +837,15 @@ async fn tenant_status(
 
         let state = tenant.current_state();
         Result::<_, ApiError>::Ok(TenantInfo {
-            id: tenant_id,
+            id: tenant_shard_id,
             state: state.clone(),
             current_physical_size: Some(current_physical_size),
             attachment_status: state.attachment_status(),
         })
     }
-    .instrument(info_span!("tenant_status_handler", %tenant_id))
+    .instrument(info_span!("tenant_status_handler", 
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()))
     .await?;
 
     json_response(StatusCode::OK, tenant_info)
@@ -868,14 +888,20 @@ async fn tenant_size_handler(
     request: Request<Body>,
     cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
     let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
     let headers = request.headers();
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+
+    if !tenant_shard_id.is_zero() {
+        return Err(ApiError::BadRequest(anyhow!(
+            "Size calculations are only available on shard zero"
+        )));
+    }
 
     // this can be long operation
     let inputs = tenant
@@ -927,7 +953,7 @@ async fn tenant_size_handler(
     json_response(
         StatusCode::OK,
         TenantHistorySize {
-            id: tenant_id,
+            id: tenant_shard_id.tenant_id,
             size: sizes.as_ref().map(|x| x.total_size),
             segment_sizes: sizes.map(|x| x.segments),
             inputs,
@@ -939,14 +965,14 @@ async fn layer_map_info_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let reset: LayerAccessStatsReset =
         parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
 
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
     let layer_map_info = timeline.layer_map_info(reset).await;
 
     json_response(StatusCode::OK, layer_map_info)
@@ -956,13 +982,12 @@ async fn layer_download_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let layer_file_name = get_request_param(&request, "layer_file_name")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
     let downloaded = timeline
         .download_layer(layer_file_name)
         .await
@@ -973,7 +998,7 @@ async fn layer_download_handler(
         Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
         None => json_response(
             StatusCode::BAD_REQUEST,
-            format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
+            format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
         ),
     }
 }
@@ -982,12 +1007,12 @@ async fn evict_timeline_layer_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let layer_file_name = get_request_param(&request, "layer_file_name")?;
 
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
     let evicted = timeline
         .evict_layer(layer_file_name)
         .await
@@ -998,7 +1023,7 @@ async fn evict_timeline_layer_handler(
         Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
         None => json_response(
             StatusCode::BAD_REQUEST,
-            format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
+            format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
         ),
     }
 }
@@ -1130,10 +1155,10 @@ async fn get_tenant_config_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    let tenant = mgr::get_tenant(tenant_id, false)?;
+    let tenant = mgr::get_tenant(tenant_shard_id, false)?;
 
     let response = HashMap::from([
         (
@@ -1227,9 +1252,9 @@ async fn handle_tenant_break(
     r: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
 
-    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
+    let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
         .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
 
     tenant.set_broken("broken from test".to_owned()).await;
@@ -1270,14 +1295,15 @@ async fn timeline_gc_handler(
     mut request: Request<Body>,
     cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let gc_req: TimelineGcRequest = json_request(&mut request).await?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?;
+    let wait_task_done =
+        mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
     let gc_result = wait_task_done
         .await
         .context("wait for gc task")
@@ -1292,9 +1318,9 @@ async fn timeline_compact_handler(
     request: Request<Body>,
     cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let mut flags = EnumSet::empty();
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1302,14 +1328,14 @@ async fn timeline_compact_handler(
     }
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
         timeline
             .compact(&cancel, flags, &ctx)
             .await
             .map_err(|e| ApiError::InternalServerError(e.into()))?;
         json_response(StatusCode::OK, ())
     }
-    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
+    .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
     .await
 }
 
@@ -1318,9 +1344,9 @@ async fn timeline_checkpoint_handler(
     request: Request<Body>,
     cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let mut flags = EnumSet::empty();
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1328,7 +1354,7 @@ async fn timeline_checkpoint_handler(
     }
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
         timeline
             .freeze_and_flush()
             .await
@@ -1340,7 +1366,7 @@ async fn timeline_checkpoint_handler(
 
         json_response(StatusCode::OK, ())
     }
-    .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
+    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
     .await
 }
 
@@ -1348,12 +1374,12 @@ async fn timeline_download_remote_layers_handler_post(
     mut request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
     match timeline.spawn_download_all_remote_layers(body).await {
         Ok(st) => json_response(StatusCode::ACCEPTED, st),
         Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1364,11 +1390,11 @@ async fn timeline_download_remote_layers_handler_get(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
 
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
     let info = timeline
         .get_download_all_remote_layers_task_info()
         .context("task never started since last pageserver process start")
@@ -1414,9 +1440,9 @@ async fn getpage_at_lsn_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     struct Key(crate::repository::Key);
 
@@ -1435,7 +1461,7 @@ async fn getpage_at_lsn_handler(
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
 
         let page = timeline.get(key.0, lsn, &ctx).await?;
 
@@ -1447,7 +1473,7 @@ async fn getpage_at_lsn_handler(
                 .unwrap(),
         )
     }
-    .instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
+    .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
     .await
 }
 
@@ -1455,9 +1481,9 @@ async fn timeline_collect_keyspace(
     request: Request<Body>,
     _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     struct Partitioning {
         keys: crate::keyspace::KeySpace,
@@ -1526,7 +1552,7 @@ async fn timeline_collect_keyspace(
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
         let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
         let keys = timeline
             .collect_keyspace(at_lsn, &ctx)
@@ -1535,15 +1561,15 @@ async fn timeline_collect_keyspace(
 
         json_response(StatusCode::OK, Partitioning { keys, at_lsn })
     }
-    .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
+    .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
     .await
 }
 
 async fn active_timeline_of_active_tenant(
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
     tenant
         .get_timeline(timeline_id, true)
         .map_err(|e| ApiError::NotFound(e.into()))
@@ -1820,23 +1846,25 @@ pub fn make_router(
         })
         .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
         .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
-        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
+        .get("/v1/tenant/:tenant_shard_id", |r| {
+            api_handler(r, tenant_status)
+        })
         .delete("/v1/tenant/:tenant_shard_id", |r| {
             api_handler(r, tenant_delete_handler)
         })
-        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
+        .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
             api_handler(r, tenant_size_handler)
         })
         .put("/v1/tenant/config", |r| {
             api_handler(r, update_tenant_config_handler)
         })
-        .get("/v1/tenant/:tenant_id/config", |r| {
+        .get("/v1/tenant/:tenant_shard_id/config", |r| {
             api_handler(r, get_tenant_config_handler)
         })
         .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
             api_handler(r, put_tenant_location_config_handler)
         })
-        .get("/v1/tenant/:tenant_id/timeline", |r| {
+        .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
             api_handler(r, timeline_list_handler)
         })
         .post("/v1/tenant/:tenant_shard_id/timeline", |r| {
@@ -1857,47 +1885,50 @@ pub fn make_router(
         .post("/v1/tenant/:tenant_id/ignore", |r| {
             api_handler(r, tenant_ignore_handler)
         })
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+        .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
             api_handler(r, timeline_detail_handler)
         })
         .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp",
             |r| api_handler(r, get_lsn_by_timestamp_handler),
         )
         .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
             |r| api_handler(r, get_timestamp_of_lsn_handler),
         )
-        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
-            api_handler(r, timeline_gc_handler)
-        })
-        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
-            testing_api_handler("run timeline compaction", r, timeline_compact_handler)
-        })
         .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
+            |r| api_handler(r, timeline_gc_handler),
+        )
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
+            |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
+        )
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
             |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
         )
         .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
             |r| api_handler(r, timeline_download_remote_layers_handler_post),
         )
         .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
             |r| api_handler(r, timeline_download_remote_layers_handler_get),
         )
         .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
             api_handler(r, timeline_delete_handler)
         })
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
-            api_handler(r, layer_map_info_handler)
-        })
         .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
+            |r| api_handler(r, layer_map_info_handler),
+        )
+        .get(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
             |r| api_handler(r, layer_download_handler),
         )
         .delete(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
             |r| api_handler(r, evict_timeline_layer_handler),
         )
         .put("/v1/disk_usage_eviction/run", |r| {
@@ -1906,18 +1937,19 @@ pub fn make_router(
         .put("/v1/deletion_queue/flush", |r| {
             api_handler(r, deletion_queue_flush)
         })
-        .put("/v1/tenant/:tenant_id/break", |r| {
+        .put("/v1/tenant/:tenant_shard_id/break", |r| {
             testing_api_handler("set tenant state to broken", r, handle_tenant_break)
         })
         .get("/v1/panic", |r| api_handler(r, always_panic_handler))
         .post("/v1/tracing/event", |r| {
             testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
         })
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
-            testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
-        })
         .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
+            |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
+        )
+        .get(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
             |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
         )
         .any(handler_404))
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 67d798c1d4..7cc0333ee5 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -650,7 +650,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
         "pageserver_evictions_with_low_residence_duration",
         "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
          Residence duration is determined using the `residence_duration_data_source`.",
-        &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
+        &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
     )
     .expect("failed to define a metric")
 });
@@ -714,10 +714,16 @@ impl EvictionsWithLowResidenceDurationBuilder {
         }
     }
 
-    fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
+    fn build(
+        &self,
+        tenant_id: &str,
+        shard_id: &str,
+        timeline_id: &str,
+    ) -> EvictionsWithLowResidenceDuration {
         let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
             .get_metric_with_label_values(&[
                 tenant_id,
+                shard_id,
                 timeline_id,
                 self.data_source,
                 &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
@@ -748,21 +754,24 @@ impl EvictionsWithLowResidenceDuration {
     pub fn change_threshold(
         &mut self,
         tenant_id: &str,
+        shard_id: &str,
         timeline_id: &str,
         new_threshold: Duration,
     ) {
         if new_threshold == self.threshold {
             return;
         }
-        let mut with_new =
-            EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
-                .build(tenant_id, timeline_id);
+        let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
+            self.data_source,
+            new_threshold,
+        )
+        .build(tenant_id, shard_id, timeline_id);
         std::mem::swap(self, &mut with_new);
-        with_new.remove(tenant_id, timeline_id);
+        with_new.remove(tenant_id, shard_id, timeline_id);
     }
 
     // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
-    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
+    fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
         let Some(_counter) = self.counter.take() else {
             return;
         };
@@ -771,6 +780,7 @@ impl EvictionsWithLowResidenceDuration {
 
         let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
             tenant_id,
+            shard_id,
             timeline_id,
             self.data_source,
             &threshold,
@@ -1603,6 +1613,7 @@ impl StorageTimeMetrics {
 #[derive(Debug)]
 pub struct TimelineMetrics {
     tenant_id: String,
+    shard_id: String,
     timeline_id: String,
     pub flush_time_histo: StorageTimeMetrics,
     pub compact_time_histo: StorageTimeMetrics,
@@ -1623,11 +1634,12 @@ pub struct TimelineMetrics {
 
 impl TimelineMetrics {
     pub fn new(
-        tenant_id: &TenantId,
+        tenant_shard_id: &TenantShardId,
         timeline_id: &TimelineId,
         evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
     ) -> Self {
-        let tenant_id = tenant_id.to_string();
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
         let timeline_id = timeline_id.to_string();
         let flush_time_histo =
             StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
@@ -1664,11 +1676,12 @@ impl TimelineMetrics {
         let evictions = EVICTIONS
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
-        let evictions_with_low_residence_duration =
-            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
+        let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
+            .build(&tenant_id, &shard_id, &timeline_id);
 
         TimelineMetrics {
             tenant_id,
+            shard_id,
             timeline_id,
             flush_time_histo,
             compact_time_histo,
@@ -1714,6 +1727,7 @@ impl Drop for TimelineMetrics {
     fn drop(&mut self) {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
+        let shard_id = &self.shard_id;
         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
         {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
@@ -1727,7 +1741,7 @@ impl Drop for TimelineMetrics {
         self.evictions_with_low_residence_duration
             .write()
             .unwrap()
-            .remove(tenant_id, timeline_id);
+            .remove(tenant_id, shard_id, timeline_id);
 
         // The following metrics are born outside of the TimelineMetrics lifecycle but still
         // removed at the end of it. The idea is to have the metrics outlive the
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index dbd85d2dcf..c3c98af406 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -28,7 +28,7 @@
 //! Page cache maps from a cache key to a buffer slot.
 //! The cache key uniquely identifies the piece of data that is being cached.
 //!
-//! The cache key for **materialized pages** is  [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! The cache key for **materialized pages** is  [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
 //! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
 //!
 //! The cache key for **immutable file** pages is [`FileId`] and a block number.
@@ -83,10 +83,8 @@ use std::{
 
 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use pageserver_api::shard::TenantShardId;
+use utils::{id::TimelineId, lsn::Lsn};
 
 use crate::{
     context::RequestContext,
@@ -154,7 +152,13 @@ enum CacheKey {
 
 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
 struct MaterializedPageHashKey {
-    tenant_id: TenantId,
+    /// Why is this TenantShardId rather than TenantId?
+    ///
+    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant.  However, this
+    /// is not the case for certain internally-generated pages (e.g. relation sizes).  In future, we may make this
+    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
+    /// special-cased in some other way.
+    tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     key: Key,
 }
@@ -378,7 +382,7 @@ impl PageCache {
     /// returned page.
     pub async fn lookup_materialized_page(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key: &Key,
         lsn: Lsn,
@@ -395,7 +399,7 @@ impl PageCache {
 
         let mut cache_key = CacheKey::MaterializedPage {
             hash_key: MaterializedPageHashKey {
-                tenant_id,
+                tenant_shard_id,
                 timeline_id,
                 key: *key,
             },
@@ -436,7 +440,7 @@ impl PageCache {
     ///
     pub async fn memorize_materialized_page(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key: Key,
         lsn: Lsn,
@@ -444,7 +448,7 @@ impl PageCache {
     ) -> anyhow::Result<()> {
         let cache_key = CacheKey::MaterializedPage {
             hash_key: MaterializedPageHashKey {
-                tenant_id,
+                tenant_shard_id,
                 timeline_id,
                 key,
             },
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 4270b6edb0..5786356720 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -42,6 +42,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};
 
 use futures::FutureExt;
+use pageserver_api::shard::TenantShardId;
 use tokio::runtime::Runtime;
 use tokio::task::JoinHandle;
 use tokio::task_local;
@@ -51,7 +52,7 @@ use tracing::{debug, error, info, warn};
 
 use once_cell::sync::Lazy;
 
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
 
 use crate::shutdown_pageserver;
 
@@ -317,7 +318,7 @@ struct PageServerTask {
 
     /// Tasks may optionally be launched for a particular tenant/timeline, enabling
     /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
 
     mutable: Mutex<MutableTaskState>,
@@ -329,7 +330,7 @@ struct PageServerTask {
 pub fn spawn<F>(
     runtime: &tokio::runtime::Handle,
     kind: TaskKind,
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
     name: &str,
     shutdown_process_on_error: bool,
@@ -345,7 +346,7 @@ where
         kind,
         name: name.to_string(),
         cancel: cancel.clone(),
-        tenant_id,
+        tenant_shard_id,
         timeline_id,
         mutable: Mutex::new(MutableTaskState { join_handle: None }),
     });
@@ -424,28 +425,28 @@ async fn task_finish(
             Ok(Err(err)) => {
                 if shutdown_process_on_error {
                     error!(
-                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
                     );
                     shutdown_process = true;
                 } else {
                     error!(
-                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
                     );
                 }
             }
             Err(err) => {
                 if shutdown_process_on_error {
                     error!(
-                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
                     );
                     shutdown_process = true;
                 } else {
                     error!(
-                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
                     );
                 }
             }
@@ -467,11 +468,11 @@ async fn task_finish(
 ///
 /// Or to shut down all tasks for given timeline:
 ///
-///   shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
+///   shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))
 ///
 pub async fn shutdown_tasks(
     kind: Option<TaskKind>,
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
 ) {
     let mut victim_tasks = Vec::new();
@@ -480,35 +481,35 @@ pub async fn shutdown_tasks(
         let tasks = TASKS.lock().unwrap();
         for task in tasks.values() {
             if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_id.is_none() || task.tenant_id == tenant_id)
+                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
                 && (timeline_id.is_none() || task.timeline_id == timeline_id)
             {
                 task.cancel.cancel();
                 victim_tasks.push((
                     Arc::clone(task),
                     task.kind,
-                    task.tenant_id,
+                    task.tenant_shard_id,
                     task.timeline_id,
                 ));
             }
         }
     }
 
-    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
+    let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();
 
-    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
+    for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
         let join_handle = {
             let mut task_mut = task.mutable.lock().unwrap();
             task_mut.join_handle.take()
         };
         if let Some(mut join_handle) = join_handle {
             if log_all {
-                if tenant_id.is_none() {
+                if tenant_shard_id.is_none() {
                     // there are quite few of these
                     info!(name = task.name, kind = ?task_kind, "stopping global task");
                 } else {
                     // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                 }
             }
             if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 48f71d7747..a8e8b4cbfa 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -608,7 +608,7 @@ impl Tenant {
         task_mgr::spawn(
             &tokio::runtime::Handle::current(),
             TaskKind::Attach,
-            Some(tenant_shard_id.tenant_id),
+            Some(tenant_shard_id),
             None,
             "attach tenant",
             false,
@@ -1917,7 +1917,7 @@ impl Tenant {
         //
         // this will additionally shutdown and await all timeline tasks.
         tracing::debug!("Waiting for tasks...");
-        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await;
+        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;
 
         // Wait for any in-flight operations to complete
         self.gate.close().await;
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index b8d6d0a321..acd311ace6 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -463,7 +463,7 @@ impl DeleteTenantFlow {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id.tenant_id),
+            Some(tenant_shard_id),
             None,
             "tenant_delete",
             false,
@@ -550,7 +550,7 @@ impl DeleteTenantFlow {
                 // we encounter an InProgress marker, yield the barrier it contains and wait on it.
                 let barrier = {
                     let mut locked = tenants.write().unwrap();
-                    let removed = locked.remove(&tenant.tenant_shard_id.tenant_id);
+                    let removed = locked.remove(tenant.tenant_shard_id);
 
                     // FIXME: we should not be modifying this from outside of mgr.rs.
                     // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 8466fe7fca..4d7bd4259f 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -98,33 +98,6 @@ pub(crate) enum TenantsMap {
     ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }
 
-/// Helper for mapping shard-unaware functions to a sharding-aware map
-/// TODO(sharding): all users of this must be made shard-aware.
-fn exactly_one_or_none<'a>(
-    map: &'a BTreeMap<TenantShardId, TenantSlot>,
-    tenant_id: &TenantId,
-) -> Option<(&'a TenantShardId, &'a TenantSlot)> {
-    let mut slots = map.range(TenantShardId::tenant_range(*tenant_id));
-
-    // Retrieve the first two slots in the range: if both are populated, we must panic because the caller
-    // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time.
-    let slot_a = slots.next();
-    let slot_b = slots.next();
-    match (slot_a, slot_b) {
-        (None, None) => None,
-        (Some(slot), None) => {
-            // Exactly one matching slot
-            Some(slot)
-        }
-        (Some(_slot_a), Some(_slot_b)) => {
-            // Multiple shards for this tenant: cannot handle this yet.
-            // TODO(sharding): callers of get() should be shard-aware.
-            todo!("Attaching multiple shards in teh same tenant to the same pageserver")
-        }
-        (None, Some(_)) => unreachable!(),
-    }
-}
-
 pub(crate) enum TenantsMapRemoveResult {
     Occupied(TenantSlot),
     Vacant,
@@ -147,12 +120,11 @@ impl TenantsMap {
     /// Convenience function for typical usage, where we want to get a `Tenant` object, for
     /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
     /// None is returned.
-    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
         match self {
             TenantsMap::Initializing => None,
             TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                // TODO(sharding): callers of get() should be shard-aware.
-                exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached())
+                m.get(tenant_shard_id).and_then(|slot| slot.get_attached())
             }
         }
     }
@@ -204,25 +176,19 @@ impl TenantsMap {
     ///
     /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
     /// slot if the enclosed tenant is shutdown.
-    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult {
+    pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
         use std::collections::btree_map::Entry;
         match self {
             TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
-                match key {
-                    Some(key) => match m.entry(key) {
-                        Entry::Occupied(entry) => match entry.get() {
-                            TenantSlot::InProgress(barrier) => {
-                                TenantsMapRemoveResult::InProgress(barrier.clone())
-                            }
-                            _ => TenantsMapRemoveResult::Occupied(entry.remove()),
-                        },
-                        Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
-                    },
-                    None => TenantsMapRemoveResult::Vacant,
-                }
-            }
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
+                Entry::Occupied(entry) => match entry.get() {
+                    TenantSlot::InProgress(barrier) => {
+                        TenantsMapRemoveResult::InProgress(barrier.clone())
+                    }
+                    _ => TenantsMapRemoveResult::Occupied(entry.remove()),
+                },
+                Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
+            },
         }
     }
 
@@ -822,14 +788,16 @@ pub(crate) async fn set_new_tenant_config(
     new_tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
 ) -> Result<(), SetNewTenantConfigError> {
+    // Legacy API: does not support sharding
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
     info!("configuring tenant {tenant_id}");
-    let tenant = get_tenant(tenant_id, true)?;
+    let tenant = get_tenant(tenant_shard_id, true)?;
 
     // This is a legacy API that only operates on attached tenants: the preferred
     // API to use is the location_config/ endpoint, which lets the caller provide
     // the full LocationConf.
     let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
 
     Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
         .await
@@ -1143,14 +1111,11 @@ pub(crate) enum GetTenantError {
 ///
 /// This method is cancel-safe.
 pub(crate) fn get_tenant(
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     active_only: bool,
 ) -> Result<Arc<Tenant>, GetTenantError> {
     let locked = TENANTS.read().unwrap();
 
-    // TODO(sharding): make all callers of get_tenant shard-aware
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
     let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
 
     match peek_slot {
@@ -1162,14 +1127,18 @@ pub(crate) fn get_tenant(
             TenantState::Active => Ok(Arc::clone(tenant)),
             _ => {
                 if active_only {
-                    Err(GetTenantError::NotActive(tenant_id))
+                    Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
                 } else {
                     Ok(Arc::clone(tenant))
                 }
             }
         },
-        Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
-        None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
+        Some(TenantSlot::InProgress(_)) => {
+            Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+        }
+        None | Some(TenantSlot::Secondary) => {
+            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+        }
     }
 }
 
@@ -1542,7 +1511,8 @@ pub(crate) enum TenantMapListError {
 ///
 /// Get list of tenants, for the mgmt API
 ///
-pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
+pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
+{
     let tenants = TENANTS.read().unwrap();
     let m = match &*tenants {
         TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1550,12 +1520,10 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, Tenan
     };
     Ok(m.iter()
         .filter_map(|(id, tenant)| match tenant {
-            TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
+            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
             TenantSlot::Secondary => None,
             TenantSlot::InProgress(_) => None,
         })
-        // TODO(sharding): make callers of this function shard-aware
-        .map(|(k, v)| (k.tenant_id, v))
         .collect())
 }
 
@@ -2089,21 +2057,19 @@ use {
 };
 
 pub(crate) async fn immediate_gc(
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     gc_req: TimelineGcRequest,
     cancel: CancellationToken,
     ctx: &RequestContext,
 ) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
     let guard = TENANTS.read().unwrap();
-    let tenant = guard
-        .get(&tenant_id)
-        .map(Arc::clone)
-        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(|e| ApiError::NotFound(e.into()))?;
 
-    // TODO(sharding): make callers of this function shard-aware
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    let tenant = guard
+        .get(&tenant_shard_id)
+        .map(Arc::clone)
+        .with_context(|| format!("tenant {tenant_shard_id}"))
+        .map_err(|e| ApiError::NotFound(e.into()))?;
 
     let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
     // Use tenant's pitr setting
@@ -2116,9 +2082,9 @@ pub(crate) async fn immediate_gc(
     task_mgr::spawn(
         &tokio::runtime::Handle::current(),
         TaskKind::GarbageCollector,
-        Some(tenant_id),
+        Some(tenant_shard_id),
         Some(timeline_id),
-        &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
+        &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
         false,
         async move {
             fail::fail_point!("immediate_gc_task_pre");
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 03600cf5ae..3765ff6e7a 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1223,7 +1223,7 @@ impl RemoteTimelineClient {
             task_mgr::spawn(
                 &self.runtime,
                 TaskKind::RemoteUploadTask,
-                Some(self.tenant_shard_id.tenant_id),
+                Some(self.tenant_shard_id),
                 Some(self.timeline_id),
                 "remote upload",
                 false,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 126d4d5563..112128ead8 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -837,7 +837,7 @@ impl LayerInner {
         crate::task_mgr::spawn(
             &tokio::runtime::Handle::current(),
             crate::task_mgr::TaskKind::RemoteDownloadTask,
-            Some(self.desc.tenant_shard_id.tenant_id),
+            Some(self.desc.tenant_shard_id),
             Some(self.desc.timeline_id),
             &task_name,
             false,
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index bc404c41a0..dc23030218 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -87,13 +87,13 @@ pub fn start_background_loops(
     tenant: &Arc<Tenant>,
     background_jobs_can_start: Option<&completion::Barrier>,
 ) {
-    let tenant_id = tenant.tenant_shard_id.tenant_id;
+    let tenant_shard_id = tenant.tenant_shard_id;
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::Compaction,
-        Some(tenant_id),
+        Some(tenant_shard_id),
         None,
-        &format!("compactor for tenant {tenant_id}"),
+        &format!("compactor for tenant {tenant_shard_id}"),
         false,
         {
             let tenant = Arc::clone(tenant);
@@ -105,7 +105,7 @@ pub fn start_background_loops(
                     _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                 };
                 compaction_loop(tenant, cancel)
-                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
+                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                     .await;
                 Ok(())
             }
@@ -114,9 +114,9 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::GarbageCollector,
-        Some(tenant_id),
+        Some(tenant_shard_id),
         None,
-        &format!("garbage collector for tenant {tenant_id}"),
+        &format!("garbage collector for tenant {tenant_shard_id}"),
         false,
         {
             let tenant = Arc::clone(tenant);
@@ -128,7 +128,7 @@ pub fn start_background_loops(
                     _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                 };
                 gc_loop(tenant, cancel)
-                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
+                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                     .await;
                 Ok(())
             }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 551b66b77d..f3907a6d2b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -77,7 +77,7 @@ use postgres_ffi::to_pg_timestamp;
 use utils::{
     completion,
     generation::Generation,
-    id::{TenantId, TimelineId},
+    id::TimelineId,
     lsn::{AtomicLsn, Lsn, RecordLsn},
     seqwait::SeqWait,
     simple_rcu::{Rcu, RcuReadGuard},
@@ -926,7 +926,7 @@ impl Timeline {
         tracing::debug!("Waiting for WalReceiverManager...");
         task_mgr::shutdown_tasks(
             Some(TaskKind::WalReceiverManager),
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
         )
         .await;
@@ -977,7 +977,7 @@ impl Timeline {
         // Shut down the layer flush task before the remote client, as one depends on the other
         task_mgr::shutdown_tasks(
             Some(TaskKind::LayerFlushTask),
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
         )
         .await;
@@ -995,12 +995,7 @@ impl Timeline {
 
         tracing::debug!("Waiting for tasks...");
 
-        task_mgr::shutdown_tasks(
-            None,
-            Some(self.tenant_shard_id.tenant_id),
-            Some(self.timeline_id),
-        )
-        .await;
+        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
 
         // Finally wait until any gate-holders are complete
         self.gate.close().await;
@@ -1314,16 +1309,20 @@ impl Timeline {
                 &self.conf.default_tenant_conf,
             );
 
-            // TODO(sharding): make evictions state shard aware
-            // (https://github.com/neondatabase/neon/issues/5953)
             let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
+            let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
 
             let timeline_id_str = self.timeline_id.to_string();
             self.metrics
                 .evictions_with_low_residence_duration
                 .write()
                 .unwrap()
-                .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold);
+                .change_threshold(
+                    &tenant_id_str,
+                    &shard_id_str,
+                    &timeline_id_str,
+                    new_threshold,
+                );
         }
     }
 
@@ -1395,7 +1394,7 @@ impl Timeline {
                 ancestor_lsn: metadata.ancestor_lsn(),
 
                 metrics: TimelineMetrics::new(
-                    &tenant_shard_id.tenant_id,
+                    &tenant_shard_id,
                     &timeline_id,
                     crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
                         "mtime",
@@ -1496,7 +1495,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::LayerFlushTask,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
             "layer flush task",
             false,
@@ -1847,7 +1846,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::InitialLogicalSizeCalculation,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
             "initial size calculation",
             false,
@@ -2020,7 +2019,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::OndemandLogicalSizeCalculation,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
             "ondemand logical size calculation",
             false,
@@ -2461,13 +2460,7 @@ impl Timeline {
         // FIXME: It's pointless to check the cache for things that are not 8kB pages.
         // We should look at the key to determine if it's a cacheable object
         let (lsn, read_guard) = cache
-            .lookup_materialized_page(
-                self.tenant_shard_id.tenant_id,
-                self.timeline_id,
-                key,
-                lsn,
-                ctx,
-            )
+            .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
             .await?;
         let img = Bytes::from(read_guard.to_vec());
         Some((lsn, img))
@@ -3209,7 +3202,7 @@ impl DurationRecorder {
 #[derive(Default)]
 struct CompactLevel0Phase1StatsBuilder {
     version: Option<u64>,
-    tenant_id: Option<TenantId>,
+    tenant_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
     read_lock_acquisition_micros: DurationRecorder,
     read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
@@ -3226,7 +3219,7 @@ struct CompactLevel0Phase1StatsBuilder {
 #[derive(serde::Serialize)]
 struct CompactLevel0Phase1Stats {
     version: u64,
-    tenant_id: TenantId,
+    tenant_id: TenantShardId,
     timeline_id: TimelineId,
     read_lock_acquisition_micros: RecordedDuration,
     read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
@@ -3745,7 +3738,7 @@ impl Timeline {
             let ctx = ctx.attached_child();
             let mut stats = CompactLevel0Phase1StatsBuilder {
                 version: Some(2),
-                tenant_id: Some(self.tenant_shard_id.tenant_id),
+                tenant_id: Some(self.tenant_shard_id),
                 timeline_id: Some(self.timeline_id),
                 ..Default::default()
             };
@@ -4207,7 +4200,7 @@ impl Timeline {
                     let cache = page_cache::get();
                     if let Err(e) = cache
                         .memorize_materialized_page(
-                            self.tenant_shard_id.tenant_id,
+                            self.tenant_shard_id,
                             self.timeline_id,
                             key,
                             last_rec_lsn,
@@ -4251,7 +4244,7 @@ impl Timeline {
         let task_id = task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::DownloadAllRemoteLayers,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
             "download all remote layers task",
             false,
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 2a103a7ff4..be873181d9 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -43,7 +43,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
     // Shut down the layer flush task before the remote client, as one depends on the other
     task_mgr::shutdown_tasks(
         Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_shard_id.tenant_id),
+        Some(timeline.tenant_shard_id),
         Some(timeline.timeline_id),
     )
     .await;
@@ -71,7 +71,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
     info!("waiting for timeline tasks to shutdown");
     task_mgr::shutdown_tasks(
         None,
-        Some(timeline.tenant_shard_id.tenant_id),
+        Some(timeline.tenant_shard_id),
         Some(timeline.timeline_id),
     )
     .await;
@@ -528,7 +528,7 @@ impl DeleteTimelineFlow {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id.tenant_id),
+            Some(tenant_shard_id),
             Some(timeline_id),
             "timeline_delete",
             false,
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 3fe4bc0f83..020c5a9e9f 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -60,7 +60,7 @@ impl Timeline {
         task_mgr::spawn(
             BACKGROUND_RUNTIME.handle(),
             TaskKind::Eviction,
-            Some(self.tenant_shard_id.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
             &format!(
                 "layer eviction for {}/{}",
@@ -343,7 +343,7 @@ impl Timeline {
         // Make one of the tenant's timelines draw the short straw and run the calculation.
         // The others wait until the calculation is done so that they take into account the
         // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) {
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
             Ok(t) => t,
             Err(_) => {
                 return ControlFlow::Break(());
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index 04ff8602d6..e32265afb5 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -30,6 +30,7 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
     connection_manager_loop_step, ConnectionManagerState,
 };
 
+use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
@@ -41,7 +42,7 @@ use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 
-use utils::id::TenantTimelineId;
+use utils::id::TimelineId;
 
 use self::connection_manager::ConnectionManagerStatus;
 
@@ -60,7 +61,8 @@ pub struct WalReceiverConf {
 }
 
 pub struct WalReceiver {
-    timeline: TenantTimelineId,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
     manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
 }
 
@@ -71,7 +73,7 @@ impl WalReceiver {
         mut broker_client: BrokerClientChannel,
         ctx: &RequestContext,
     ) -> Self {
-        let tenant_id = timeline.tenant_shard_id.tenant_id;
+        let tenant_shard_id = timeline.tenant_shard_id;
         let timeline_id = timeline.timeline_id;
         let walreceiver_ctx =
             ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
@@ -81,9 +83,9 @@ impl WalReceiver {
         task_mgr::spawn(
             WALRECEIVER_RUNTIME.handle(),
             TaskKind::WalReceiverManager,
-            Some(tenant_id),
+            Some(timeline.tenant_shard_id),
             Some(timeline_id),
-            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
+            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
             false,
             async move {
                 debug_assert_current_span_has_tenant_and_timeline_id();
@@ -117,11 +119,12 @@ impl WalReceiver {
                 *loop_status.write().unwrap() = None;
                 Ok(())
             }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
         );
 
         Self {
-            timeline: TenantTimelineId::new(tenant_id, timeline_id),
+            tenant_shard_id,
+            timeline_id,
             manager_status,
         }
     }
@@ -129,8 +132,8 @@ impl WalReceiver {
     pub async fn stop(self) {
         task_mgr::shutdown_tasks(
             Some(TaskKind::WalReceiverManager),
-            Some(self.timeline.tenant_id),
-            Some(self.timeline.timeline_id),
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
         )
         .await;
     }
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 3bcb7ff891..61ab236322 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection(
     task_mgr::spawn(
         WALRECEIVER_RUNTIME.handle(),
         TaskKind::WalReceiverConnectionPoller,
-        Some(timeline.tenant_shard_id.tenant_id),
+        Some(timeline.tenant_shard_id),
         Some(timeline.timeline_id),
         "walreceiver connection",
         false,

From 6a922b1a7543c41ef38ce8daaa7b4c9da271c158 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 11 Dec 2023 16:55:43 +0000
Subject: [PATCH 067/202] tests: start adding tests for secondary mode, live
 migration (#5842)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These tests have been loitering on a branch of mine for a while: they
already provide value even without all the secondary mode bits landed
yet, and the Workload helper is handy for other tests too.

- `Workload` is a re-usable test workload that replaces some of the
arbitrary "write a few rows" SQL that I've found myself repeating, and
adds a systematic way to append data and check that reads properly
reflect the changes. This append+validate stuff is important when doing
migrations, as we want to detect situations where we might be reading
from a pageserver that has not properly seen latest changes.
- test_multi_attach is a validation of how the pageserver handles
attaching the same tenant to multiple pageservers, from a safety point
of view. This is intentionally separate from the larger testing of
migration, to provide an isolated environment for multi-attachment.
- test_location_conf_churn is a pseudo-random walk through the various
states that TenantSlot can be put into, with validation that attached
tenants remain externally readable when they should, and as a side
effect validating that the compute endpoint's online configuration
changes work as expected.
- test_live_migration is the reference implementation of how to drive a
pair of pageservers through a zero-downtime migration of a tenant.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 test_runner/fixtures/neon_fixtures.py         |  27 +-
 test_runner/fixtures/pageserver/http.py       |  19 +-
 test_runner/fixtures/workload.py              | 148 ++++++++
 .../regress/test_pageserver_generations.py    | 103 +++++-
 .../regress/test_pageserver_secondary.py      | 332 ++++++++++++++++++
 5 files changed, 623 insertions(+), 6 deletions(-)
 create mode 100644 test_runner/fixtures/workload.py
 create mode 100644 test_runner/regress/test_pageserver_secondary.py

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index c569b63d4e..fb6cea5713 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1712,7 +1712,7 @@ class NeonPageserver(PgProtocol):
 
     @property
     def workdir(self) -> Path:
-        return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}"))
+        return self.env.repo_dir / f"pageserver_{self.id}"
 
     def assert_no_errors(self):
         logfile = self.workdir / "pageserver.log"
@@ -1784,6 +1784,27 @@ class NeonPageserver(PgProtocol):
         client = self.http_client()
         return client.tenant_detach(tenant_id)
 
+    def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
+        # This API is only for use when generations are enabled
+        assert self.env.attachment_service is not None
+
+        if config["mode"].startswith("Attached") and "generation" not in config:
+            config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+
+        client = self.http_client()
+        return client.tenant_location_conf(tenant_id, config, **kwargs)
+
+    def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]:
+        path = self.tenant_dir(tenant_id) / "config-v1"  # on-disk location conf for this tenant
+        log.info(f"Reading location conf from {path}")
+        raw = path.read_text()  # closes the file, unlike a bare open().read()
+        try:
+            decoded: dict[str, Any] = toml.loads(raw)
+            return decoded
+        except Exception:  # log the raw content for debugging, then re-raise unchanged
+            log.error(f"Failed to decode LocationConf, raw content ({len(raw)} bytes): {raw}")
+            raise
+
     def tenant_create(
         self,
         tenant_id: TenantId,
@@ -2717,6 +2738,7 @@ class EndpointFactory:
         lsn: Optional[Lsn] = None,
         hot_standby: bool = False,
         config_lines: Optional[List[str]] = None,
+        pageserver_id: Optional[int] = None,
     ) -> Endpoint:
         ep = Endpoint(
             self.env,
@@ -2736,6 +2758,7 @@ class EndpointFactory:
             lsn=lsn,
             hot_standby=hot_standby,
             config_lines=config_lines,
+            pageserver_id=pageserver_id,
         )
 
     def stop_all(self) -> "EndpointFactory":
@@ -3082,7 +3105,7 @@ def pytest_addoption(parser: Parser):
 
 
 SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"config|metadata|.+\.(?:toml|pid|json|sql)"
+    r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)"
 )
 
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 3e75bac424..b46ddf5527 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -150,7 +150,7 @@ class PageserverHttpClient(requests.Session):
                 # (this may change in future if we do fault injection of a kind that causes
                 #  requests TCP flows to stick)
                 read=False,
-                backoff_factor=0,
+                backoff_factor=0.2,
                 status_forcelist=[503],
                 allowed_methods=None,
                 remove_headers_on_redirect=[],
@@ -277,6 +277,23 @@ class PageserverHttpClient(requests.Session):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
         self.verbose_error(res)
 
+    def tenant_location_conf(
+        self, tenant_id: TenantId, location_conf: dict[str, Any], flush_ms=None
+    ):
+        body = location_conf.copy()  # shallow copy: do not mutate the caller's dict
+        body["tenant_id"] = str(tenant_id)
+        # flush_ms is only sent as a query parameter when the caller supplied one
+        params = {}
+        if flush_ms is not None:
+            params["flush_ms"] = str(flush_ms)
+        # PUT the location config as the JSON body of the tenant's location_config endpoint
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
+            json=body,
+            params=params,
+        )
+        self.verbose_error(res)
+
     def tenant_delete(self, tenant_id: TenantId):
         res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
         self.verbose_error(res)
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
new file mode 100644
index 0000000000..241531437c
--- /dev/null
+++ b/test_runner/fixtures/workload.py
@@ -0,0 +1,148 @@
+from typing import Optional
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    last_flush_lsn_upload,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.types import TenantId, TimelineId
+
+
+class Workload:
+    """
+    This is not a general purpose load generator: it exists for storage tests that need to inject some
+    high level types of storage work via the postgres interface:
+    - layer writes (`write_rows`)
+    - work for compaction (`churn_rows`)
+    - reads, checking we get the right data (`validate`)
+    """
+
+    def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
+        self.env = env
+        self.tenant_id = tenant_id
+        self.timeline_id = timeline_id
+        self.table = "foo"
+
+        self.expect_rows = 0
+        self.churn_cursor = 0
+
+        self._endpoint: Optional[Endpoint] = None
+
+    def endpoint(self, pageserver_id: int) -> Endpoint:
+        if self._endpoint is None:
+            self._endpoint = self.env.endpoints.create(
+                "main",
+                tenant_id=self.tenant_id,
+                pageserver_id=pageserver_id,
+                endpoint_id="ep-workload",
+            )
+            self._endpoint.start(pageserver_id=pageserver_id)
+        else:
+            self._endpoint.reconfigure(pageserver_id=pageserver_id)
+
+        connstring = self._endpoint.safe_psql(
+            "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
+        )
+        log.info(f"Workload.endpoint: connstr={connstring}")
+
+        return self._endpoint
+
+    def __del__(self):
+        if self._endpoint is not None:
+            self._endpoint.stop()
+
+    def init(self, pageserver_id: int):
+        endpoint = self.endpoint(pageserver_id)
+
+        endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
+        endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
+        last_flush_lsn_upload(
+            self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
+        )
+
+    def write_rows(self, n, pageserver_id):
+        endpoint = self.endpoint(pageserver_id)
+        start = self.expect_rows
+        end = start + n - 1
+        self.expect_rows += n
+        dummy_value = "blah"
+        endpoint.safe_psql(
+            f"""
+            INSERT INTO {self.table} (id, val)
+            SELECT g, '{dummy_value}'
+            FROM generate_series({start}, {end}) g
+            """
+        )
+
+        return last_flush_lsn_upload(
+            self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
+        )
+
+    def churn_rows(self, n, pageserver_id, upload=True):
+        assert self.expect_rows >= n
+        # Rewrite n existing rows (upsert + VACUUM), wrapping the cursor around the table
+        max_iters = 10
+        endpoint = self.endpoint(pageserver_id)
+        todo = n
+        i = 0
+        while todo > 0:
+            i += 1
+            if i > max_iters:
+                raise RuntimeError("oops")
+            start = self.churn_cursor % self.expect_rows
+            n_iter = min((self.expect_rows - start), todo)
+            todo -= n_iter
+
+            end = start + n_iter - 1
+
+            log.info(
+                f"start,end = {start},{end}, cursor={self.churn_cursor}, expect_rows={self.expect_rows}"
+            )
+
+            assert end < self.expect_rows
+
+            self.churn_cursor += n_iter
+            dummy_value = "blah"
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                INSERT INTO {self.table} (id, val)
+                SELECT g, '{dummy_value}'
+                FROM generate_series({start}, {end}) g
+                ON CONFLICT (id) DO UPDATE
+                SET val = EXCLUDED.val
+                """,
+                    f"VACUUM {self.table}",
+                ]
+            )
+
+        last_flush_lsn = wait_for_last_flush_lsn(
+            self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
+        )
+        ps_http = self.env.get_pageserver(pageserver_id).http_client()
+        wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
+
+        if upload:
+            # force a checkpoint to trigger upload
+            ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
+            log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
+            wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
+        else:
+            log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
+
+    def validate(self, pageserver_id):
+        endpoint = self.endpoint(pageserver_id)
+        result = endpoint.safe_psql_many(
+            [
+                "select clear_buffer_cache()",
+                f"""
+            SELECT COUNT(*) FROM {self.table}
+            """,
+            ]
+        )
+
+        log.info(f"validate({self.expect_rows}): {result}")
+        assert result == [[("",)], [(self.expect_rows,)]]
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 66cc286aba..4488be31c5 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -23,14 +23,20 @@ from fixtures.neon_fixtures import (
     PgBin,
     S3Scrubber,
     last_flush_lsn_upload,
-    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.utils import list_prefix
+from fixtures.pageserver.http import PageserverApiException
+from fixtures.pageserver.utils import (
+    assert_tenant_state,
+    list_prefix,
+    wait_for_last_record_lsn,
+    wait_for_upload,
+)
 from fixtures.remote_storage import (
     RemoteStorageKind,
 )
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import print_gc_result, wait_until
+from fixtures.workload import Workload
 
 # A tenant configuration that is convenient for generating uploads and deletions
 # without a large amount of postgres traffic.
@@ -93,7 +99,10 @@ def generate_uploads_and_deletions(
             )
             assert tenant_id is not None
             assert timeline_id is not None
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+            # We are waiting for uploads as well as local flush, in order to avoid leaving the system
+            # in a state where there are "future layers" in remote storage that will generate deletions
+            # after a restart.
+            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
             ps_http.timeline_checkpoint(tenant_id, timeline_id)
 
         # Compaction should generate some GC-elegible layers
@@ -560,3 +569,91 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
     read_all(env, tenant_id, timeline_id)
     evict_all_layers(env, tenant_id, timeline_id)
     read_all(env, tenant_id, timeline_id)
+
+
+def test_multi_attach(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    neon_env_builder.enable_generations = True
+    neon_env_builder.num_pageservers = 3
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    pageservers = env.pageservers
+    http_clients = list([p.http_client() for p in pageservers])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # We will intentionally create situations where stale deletions happen from non-latest-generation
+    # nodes when the tenant is multiply-attached
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(
+            [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+        )
+
+    # Initially, the tenant will be attached to the first pageserver (first is default in our test harness)
+    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
+    _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
+    with pytest.raises(PageserverApiException):
+        http_clients[1].timeline_detail(tenant_id, timeline_id)
+    with pytest.raises(PageserverApiException):
+        http_clients[2].timeline_detail(tenant_id, timeline_id)
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(pageservers[0].id)
+    workload.write_rows(1000, pageservers[0].id)
+
+    # Attach the tenant to the other two pageservers
+    pageservers[1].tenant_attach(env.initial_tenant)
+    pageservers[2].tenant_attach(env.initial_tenant)
+
+    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active"))
+    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active"))
+
+    # Now they all have it attached
+    _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients])
+    _detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
+    _detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
+
+    # The endpoint can use any pageserver to service its reads
+    for pageserver in pageservers:
+        workload.validate(pageserver.id)
+
+    # If we write some more data, all the nodes can see it, including stale ones
+    wrote_lsn = workload.write_rows(1000, pageservers[0].id)
+    for ps_http in http_clients:
+        wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, wrote_lsn)
+
+    # ...and indeed endpoints can see it via any of the pageservers
+    for pageserver in pageservers:
+        workload.validate(pageserver.id)
+
+    # Prompt all the pageservers, including stale ones, to upload ingested layers to remote storage
+    for ps_http in http_clients:
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        wait_for_upload(ps_http, tenant_id, timeline_id, wrote_lsn)
+
+    # Now, the contents of remote storage will be a set of layers from each pageserver, but with unique
+    # generation numbers
+    # TODO: validate remote storage contents
+
+    # Stop all pageservers
+    for ps in pageservers:
+        ps.stop()
+
+    # Returning to a normal healthy state: all pageservers will start, but only the one most
+    # recently attached via the control plane will re-attach on startup
+    for ps in pageservers:
+        ps.start()
+
+    with pytest.raises(PageserverApiException):
+        _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
+    with pytest.raises(PageserverApiException):
+        _detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
+    _detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
+
+    # All data we wrote while multi-attached remains readable
+    workload.validate(pageservers[2].id)
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
new file mode 100644
index 0000000000..b14b7f1328
--- /dev/null
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -0,0 +1,332 @@
+import random
+from typing import Any, Dict, Optional
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
+from fixtures.remote_storage import RemoteStorageKind
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import wait_until
+from fixtures.workload import Workload
+
+# A tenant configuration that is convenient for generating uploads and deletions
+# without a large amount of postgres traffic.
+TENANT_CONF = {
+    # small checkpointing and compaction targets to ensure we generate many upload operations
+    "checkpoint_distance": f"{128 * 1024}",
+    "compaction_target_size": f"{128 * 1024}",
+    "compaction_threshold": "1",
+    # no PITR horizon, we specify the horizon when we request on-demand GC
+    "pitr_interval": "0s",
+    # disable background compaction and GC. We invoke it manually when we want it to happen.
+    "gc_period": "0s",
+    "compaction_period": "0s",
+    # create image layers eagerly, so that GC can remove some layers
+    "image_creation_threshold": "1",
+}
+
+
+def evict_random_layers(
+    rng: random.Random, pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
+):
+    """
+    Evict each eligible layer on a pageserver with 50% probability
+    """
+    timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
+    initial_local_layers = sorted(
+        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
+    )
+    client = pageserver.http_client()
+    for layer in initial_local_layers:
+        if "ephemeral" in layer.name or "temp_download" in layer.name:
+            continue
+
+        if rng.choice([True, False]):
+            log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
+            client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
+
+
+@pytest.mark.parametrize("seed", [1, 2, 3])
+def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
+    """
+    Issue many location configuration changes, ensure that tenants
+    remain readable & we don't get any unexpected errors.  We should
+    have no ERROR in the log, and no 500s in the API.
+
+    The location_config API is intentionally designed so that all destination
+    states are valid, so that we may test it in this way: the API should always
+    work as long as the tenant exists.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.num_pageservers = 3
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    pageservers = env.pageservers
+    list([p.http_client() for p in pageservers])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # We will make no effort to avoid stale attachments
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(
+            [
+                ".*Dropped remote consistent LSN updates.*",
+                ".*Dropping stale deletions.*",
+                # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found
+                ".*query handler.*Tenant.*not found.*",
+                # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active
+                ".*query handler.*Tenant.*not active.*",
+            ]
+        )
+
+        # these can happen, if we shutdown at a good time. to be fixed as part of #5172.
+        message = ".*duplicated L1 layer layer=.*"
+        ps.allowed_errors.append(message)
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+    workload.write_rows(256, env.pageservers[0].id)
+
+    # We use a fixed seed to make the test reproducible: we want a randomly
+    # chosen order, but not to change the order every time we run the test.
+    rng = random.Random(seed)
+
+    initial_generation = 1
+    last_state = {
+        env.pageservers[0].id: ("AttachedSingle", initial_generation),
+        env.pageservers[1].id: ("Detached", None),
+        env.pageservers[2].id: ("Detached", None),
+    }
+
+    latest_attached = env.pageservers[0].id
+
+    for _i in range(0, 64):
+        # Pick a pageserver
+        pageserver = rng.choice(env.pageservers)
+
+        # Pick a pseudorandom state
+        modes = [
+            "AttachedSingle",
+            "AttachedMulti",
+            "AttachedStale",
+            "Secondary",
+            "Detached",
+            "_Evictions",
+            "_Restart",
+        ]
+
+        mode = rng.choice(modes)
+
+        last_state_ps = last_state[pageserver.id]
+        if mode == "_Evictions":
+            if last_state_ps[0].startswith("Attached"):
+                log.info(f"Action: evictions on pageserver {pageserver.id}")
+                evict_random_layers(rng, pageserver, tenant_id, timeline_id)
+            else:
+                log.info(
+                    f"Action: skipping evictions on pageserver {pageserver.id}, is not attached"
+                )
+        elif mode == "_Restart":
+            log.info(f"Action: restarting pageserver {pageserver.id}")
+            pageserver.stop()
+            pageserver.start()
+            if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id:
+                log.info("Entering postgres...")
+                workload.churn_rows(rng.randint(128, 256), pageserver.id)
+                workload.validate(pageserver.id)
+            elif last_state_ps[0].startswith("Attached"):
+                # The `attachment_service` will only re-attach on startup when a pageserver was the
+                # holder of the latest generation: otherwise the pageserver will revert to detached
+                # state if it was running attached with a stale generation
+                last_state[pageserver.id] = ("Detached", None)
+        else:
+            secondary_conf: Optional[Dict[str, Any]] = None
+            if mode == "Secondary":
+                secondary_conf = {"warm": rng.choice([True, False])}
+
+            location_conf: Dict[str, Any] = {
+                "mode": mode,
+                "secondary_conf": secondary_conf,
+                "tenant_conf": {},
+            }
+
+            log.info(f"Action: Configuring pageserver {pageserver.id} to {location_conf}")
+
+            # Select a generation number
+            if mode.startswith("Attached"):
+                if last_state_ps[1] is not None:
+                    if rng.choice([True, False]):
+                        # Move between attached states, staying in the same generation
+                        generation = last_state_ps[1]
+                    else:
+                        # Switch generations, while also jumping between attached states
+                        generation = env.attachment_service.attach_hook_issue(
+                            tenant_id, pageserver.id
+                        )
+                        latest_attached = pageserver.id
+                else:
+                    generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id)
+                    latest_attached = pageserver.id
+            else:
+                generation = None
+
+            location_conf["generation"] = generation
+
+            pageserver.tenant_location_configure(tenant_id, location_conf)
+            last_state[pageserver.id] = (mode, generation)
+
+            if mode.startswith("Attached"):
+                # This is a basic test: we are validating that the endpoint works properly _between_
+                # configuration changes.  A stronger test would be to validate that clients see
+                # no errors while we are making the changes.
+                workload.churn_rows(
+                    rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale"
+                )
+                workload.validate(pageserver.id)
+
+    # Attach all pageservers
+    for ps in env.pageservers:
+        location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}}
+        ps.tenant_location_configure(tenant_id, location_conf)
+
+    # Confirm that all are readable
+    for ps in env.pageservers:
+        workload.validate(ps.id)
+
+    # Detach all pageservers
+    for ps in env.pageservers:
+        location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}}
+        ps.tenant_location_configure(tenant_id, location_conf)
+
+    # Confirm that all local disk state was removed on detach
+    # TODO
+
+
+def test_live_migration(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the sequence of location states that are used in a live migration.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    pageserver_a = env.pageservers[0]
+    pageserver_b = env.pageservers[1]
+
+    initial_generation = 1
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+    workload.write_rows(256, env.pageservers[0].id)
+
+    # Make the destination a secondary location
+    pageserver_b.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        },
+    )
+
+    workload.churn_rows(64, pageserver_a.id, upload=False)
+
+    # Set origin attachment to stale
+    log.info("Setting origin to AttachedStale")
+    pageserver_a.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "AttachedStale",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": initial_generation,
+        },
+        flush_ms=5000,
+    )
+
+    migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
+    log.info(f"Acquired generation {migrated_generation} for destination pageserver")
+    assert migrated_generation == initial_generation + 1
+
+    # Writes and reads still work in AttachedStale.
+    workload.validate(pageserver_a.id)
+
+    # TODO: call into secondary mode API hooks to do an upload/download sync
+
+    # Generate some more dirty writes: we expect the origin to ingest WAL
+    # in AttachedStale
+    workload.churn_rows(64, pageserver_a.id, upload=False)
+    workload.validate(pageserver_a.id)
+
+    # Attach the destination
+    log.info("Setting destination to AttachedMulti")
+    pageserver_b.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "AttachedMulti",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": migrated_generation,
+        },
+    )
+
+    # Wait for destination LSN to catch up with origin
+    origin_lsn = pageserver_a.http_client().timeline_detail(tenant_id, timeline_id)[
+        "last_record_lsn"
+    ]
+
+    def caught_up():
+        destination_lsn = pageserver_b.http_client().timeline_detail(tenant_id, timeline_id)[
+            "last_record_lsn"
+        ]
+        log.info(
+            f"Waiting for LSN to catch up: origin {origin_lsn} vs destination {destination_lsn}"
+        )
+        assert destination_lsn >= origin_lsn
+
+    wait_until(100, 0.1, caught_up)
+
+    # The destination should accept writes
+    workload.churn_rows(64, pageserver_b.id)
+
+    # Dual attached: both are readable.
+    workload.validate(pageserver_a.id)
+    workload.validate(pageserver_b.id)
+
+    # Revert the origin to secondary
+    log.info("Setting origin to Secondary")
+    pageserver_a.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        },
+    )
+
+    workload.churn_rows(64, pageserver_b.id)
+
+    # Put the destination into final state
+    pageserver_b.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": migrated_generation,
+        },
+    )
+
+    workload.churn_rows(64, pageserver_b.id)
+    workload.validate(pageserver_b.id)

From 036558c956da0833776a68a953e9cff443442a83 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 11 Dec 2023 10:25:43 -0600
Subject: [PATCH 068/202] Fix git ownership issue in check-codestyle-rust-arm

We have this workaround for other jobs. Looks like this one was
forgotten about.
---
 .github/workflows/neon_extra_builds.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 0d7db8dfbc..09a106fb52 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -238,6 +238,20 @@ jobs:
       options: --init
 
     steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
       - name: Checkout
         uses: actions/checkout@v4
         with:

From 5ab9592a2dd2203f8f17e97ad6b69b28e63fe383 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 11 Dec 2023 10:46:41 -0600
Subject: [PATCH 069/202] Add submodule paths as safe directories as a
 precaution

The check-codestyle-rust-arm job requires this for some reason, so let's
just add them everywhere we do this workaround.
---
 .github/workflows/build_and_test.yml    | 8 ++++++++
 .github/workflows/neon_extra_builds.yml | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 820848b4fb..693ed1a66f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -199,6 +199,10 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
 
       - name: Checkout
         uses: actions/checkout@v3
@@ -1097,6 +1101,10 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
 
       - name: Checkout
         uses: actions/checkout@v3
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 09a106fb52..b1ea5e4f74 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -142,6 +142,10 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
 
       - name: Checkout
         uses: actions/checkout@v4

From c49fd69bd6f7f109f3c9100ee8544d132ccee020 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 11 Dec 2023 22:08:14 +0100
Subject: [PATCH 070/202] Add initdb_lsn to TimelineInfo (#6104)

This way, we can query it.

Background: I want to do statistics for how reproducible `initdb_lsn`
really is, see https://github.com/neondatabase/cloud/issues/8284 and
https://neondb.slack.com/archives/C036U0GRMRB/p1701895218280269
---
 libs/pageserver_api/src/models.rs | 3 +++
 pageserver/src/http/routes.rs     | 8 +++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 2572bcf74f..a3029e67a5 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -385,6 +385,9 @@ pub struct TimelineInfo {
     /// The LSN that we are advertizing to safekeepers
     pub remote_consistent_lsn_visible: Lsn,
 
+    /// The LSN from the start of the root timeline (never changes)
+    pub initdb_lsn: Lsn,
+
     pub current_logical_size: u64,
     pub current_logical_size_is_accurate: bool,
 
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index b9b0250671..fee50460a5 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -319,6 +319,7 @@ async fn build_timeline_info_common(
     ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
     crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+    let initdb_lsn = timeline.initdb_lsn;
     let last_record_lsn = timeline.get_last_record_lsn();
     let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
         let guard = timeline.last_received_wal.lock().unwrap();
@@ -359,6 +360,7 @@ async fn build_timeline_info_common(
         disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
         remote_consistent_lsn: remote_consistent_lsn_projected,
         remote_consistent_lsn_visible,
+        initdb_lsn,
         last_record_lsn,
         prev_record_lsn: Some(timeline.get_prev_record_lsn()),
         latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -506,7 +508,7 @@ async fn timeline_list_handler(
         }
         Ok::<Vec<TimelineInfo>, ApiError>(response_data)
     }
-    .instrument(info_span!("timeline_list", 
+    .instrument(info_span!("timeline_list",
                 tenant_id = %tenant_shard_id.tenant_id,
                 shard_id = %tenant_shard_id.shard_slug()))
     .await?;
@@ -545,7 +547,7 @@ async fn timeline_detail_handler(
 
         Ok::<_, ApiError>(timeline_info)
     }
-    .instrument(info_span!("timeline_detail", 
+    .instrument(info_span!("timeline_detail",
                 tenant_id = %tenant_shard_id.tenant_id,
                 shard_id = %tenant_shard_id.shard_slug(),
                 %timeline_id))
@@ -843,7 +845,7 @@ async fn tenant_status(
             attachment_status: state.attachment_status(),
         })
     }
-    .instrument(info_span!("tenant_status_handler", 
+    .instrument(info_span!("tenant_status_handler",
                 tenant_id = %tenant_shard_id.tenant_id,
                 shard_id = %tenant_shard_id.shard_slug()))
     .await?;

From 3b04f3a7490562999397ed4263fb7cd0a817addb Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 11 Dec 2023 23:27:53 +0200
Subject: [PATCH 071/202] fix: accidental return Ok (#6106)

Error indicating request cancellation OR timeline shutdown was deemed as
a reason to exit the background worker that calculated synthetic size.
Fix it to only be considered for avoiding logging of such errors.
---
 pageserver/src/consumption_metrics.rs | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index bb13bdd5e5..8f2b88d191 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -287,14 +287,20 @@ async fn calculate_synthetic_size_worker(
                 // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
                 // which turns out is really handy to understand the system.
                 if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
-                    if let Some(PageReconstructError::Cancelled) =
-                        e.downcast_ref::<PageReconstructError>()
-                    {
-                        return Ok(());
-                    }
-                    error!(
-                        "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"
+                    // this error can be returned if timeline is shutting down, but it does not
+                    // mean the synthetic size worker should terminate. we do not need any checks
+                    // in this function because `mgr::get_tenant` will error out after shutdown has
+                    // progressed to shutting down tenants.
+                    let is_cancelled = matches!(
+                        e.downcast_ref::<PageReconstructError>(),
+                        Some(PageReconstructError::Cancelled)
                     );
+
+                    if !is_cancelled {
+                        error!(
+                            "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"
+                        );
+                    }
                 }
             }
         }
@@ -307,7 +313,7 @@ async fn calculate_synthetic_size_worker(
 
         let res = tokio::time::timeout_at(
             started_at + synthetic_size_calculation_interval,
-            task_mgr::shutdown_token().cancelled(),
+            cancel.cancelled(),
         )
         .await;
         if res.is_ok() {

From 20e9cf7d3131357b2ae9fa702795a6981e1347a9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 12 Dec 2023 07:19:59 +0000
Subject: [PATCH 072/202] pageserver: tweaks to slow/hung task logging (#6098)

## Problem

- `shutdown_tasks` would log when a particular task was taking a long
time to shut down, but not when it eventually completed. That left one
uncertain as to whether the slow task was the source of a hang, or just
a precursor.

## Summary of changes

- Add a log line after a slow task shutdown
- Add an equivalent in Gate's `warn_if_stuck`, in case we ever need it.
This isn't related to the original issue but was noticed when checking
through these logging paths.
---
 libs/utils/src/sync/gate.rs | 18 ++++++++++++++++--
 pageserver/src/task_mgr.rs  |  3 ++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs
index 9aad0af22d..31c76d2f74 100644
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -30,18 +30,32 @@ async fn warn_if_stuck<Fut: std::future::Future>(
 
     let mut fut = std::pin::pin!(fut);
 
-    loop {
+    let mut warned = false;
+    let ret = loop {
         match tokio::time::timeout(warn_period, &mut fut).await {
-            Ok(ret) => return ret,
+            Ok(ret) => break ret,
             Err(_) => {
                 tracing::warn!(
                     gate = name,
                     elapsed_ms = started.elapsed().as_millis(),
                     "still waiting, taking longer than expected..."
                 );
+                warned = true;
             }
         }
+    };
+
+    // If we emitted a warning for slowness, also emit a message when we complete, so that
+    // someone debugging a shutdown can know for sure whether we have moved past this operation.
+    if warned {
+        tracing::info!(
+            gate = name,
+            elapsed_ms = started.elapsed().as_millis(),
+            "completed, after taking longer than expected"
+        )
     }
+
+    ret
 }
 
 #[derive(Debug)]
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 5786356720..8747d9ad50 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -518,12 +518,13 @@ pub async fn shutdown_tasks(
             {
                 // allow some time to elapse before logging to cut down the number of log
                 // lines.
-                info!("waiting for {} to shut down", task.name);
+                info!("waiting for task {} to shut down", task.name);
                 // we never handled this return value, but:
                 // - we don't deschedule which would lead to is_cancelled
                 // - panics are already logged (is_panicked)
                 // - task errors are already logged in the wrapper
                 let _ = join_handle.await;
+                info!("task {} completed", task.name);
             }
         } else {
             // Possibly one of:

From fead836f26aa8dc93a9876812438e5d81ca23b42 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 12 Dec 2023 10:39:34 +0000
Subject: [PATCH 073/202] swagger: remove 'format: hex' from tenant IDs (#6099)

## Problem

TenantId is changing to TenantShardId in many APIs. The swagger had
`format: hex` attributes on some of these IDs. That isn't formally
defined anywhere, but a reasonable person might think it means "hex
digits only", which will no longer be the case once we start using
shard-aware IDs (they're like `<tenant_id>-0001`).



## Summary of changes

- Remove these `format` attributes from all `tenant_id` fields in the
swagger definition
---
 pageserver/src/http/openapi_spec.yml | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 237109abfe..9422ccb2fd 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -84,7 +84,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
     get:
       description: Get tenant status
       responses:
@@ -181,7 +180,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
     get:
       description: Get timelines for tenant
       responses:
@@ -232,7 +230,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
       - name: timeline_id
         in: path
         required: true
@@ -338,7 +335,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
       - name: timeline_id
         in: path
         required: true
@@ -401,7 +397,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
       - name: timeline_id
         in: path
         required: true
@@ -469,7 +464,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
       - name: timeline_id
         in: path
         required: true
@@ -523,7 +517,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
     post:
       description: |
         Schedules attach operation to happen in the background for the given tenant.
@@ -631,7 +624,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
       - name: flush_ms
         in: query
         required: false
@@ -724,7 +716,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
       - name: detach_ignored
         in: query
         required: false
@@ -784,7 +775,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
     post:
       description: |
         Remove tenant data (including all corresponding timelines) from pageserver's memory.
@@ -833,7 +823,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
     post:
       description: |
         Schedules an operation that attempts to load a tenant from the local disk and
@@ -890,7 +879,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
     get:
       description: |
         Calculate tenant's synthetic size
@@ -933,7 +921,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
       - name: inputs_only
         in: query
         required: false
@@ -1003,7 +990,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
     post:
       description: |
         Create a timeline. Returns new timeline id on success.\
@@ -1137,7 +1123,6 @@ paths:
             application/json:
               schema:
                 type: string
-                format: hex
         "400":
           description: Malformed tenant create request
           content:
@@ -1234,7 +1219,6 @@ paths:
         required: true
         schema:
           type: string
-          format: hex
     get:
       description: |
         Returns tenant's config description: specific config overrides a tenant has
@@ -1340,7 +1324,6 @@ components:
           properties:
             new_tenant_id:
               type: string
-              format: hex
             generation:
               type: integer
               description: Attachment generation number.
@@ -1369,7 +1352,6 @@ components:
           properties:
             tenant_id:
               type: string
-              format: hex
     TenantLocationConfigRequest:
       type: object
       required:
@@ -1377,7 +1359,6 @@ components:
       properties:
         tenant_id:
           type: string
-          format: hex
         mode:
           type: string
           enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
@@ -1446,7 +1427,6 @@ components:
           format: hex
         tenant_id:
           type: string
-          format: hex
         last_record_lsn:
           type: string
           format: hex

From 9e071e445814dd57af30eb589f61beec4a746c54 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Tue, 12 Dec 2023 12:42:51 +0100
Subject: [PATCH 074/202] Propagate information about the protocol to console
 (#6102)

## Problem

Snowflake logs currently contain no information about the protocol
that the client uses.

## Summary of changes

Propagate the information about the protocol together with the app_name,
in the format `{app_name}/{sql_over_http/tcp/ws}`.

This will give @stepashka more observability into what our clients are
using.
---
 proxy/src/auth/backend.rs          | 10 +++++-----
 proxy/src/console/provider.rs      | 12 ++++++------
 proxy/src/console/provider/mock.rs |  6 +++---
 proxy/src/console/provider/neon.rs | 20 ++++++++++----------
 proxy/src/proxy.rs                 | 13 ++++++++-----
 proxy/src/proxy/tests.rs           |  4 ++--
 proxy/src/serverless/conn_pool.rs  |  4 ++--
 7 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 649b3f40f2..ba054b53eb 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -166,7 +166,7 @@ impl TryFrom<ClientCredentials> for ComputeUserInfo {
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
     api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
+    extra: &ConsoleReqExtra,
     creds: ClientCredentials,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
@@ -235,7 +235,7 @@ async fn auth_quirks(
 /// only if authentication was successfuly.
 async fn auth_and_wake_compute(
     api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
+    extra: &ConsoleReqExtra,
     creds: ClientCredentials,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
@@ -314,7 +314,7 @@ impl<'a> BackendType<'a, ClientCredentials> {
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub async fn authenticate(
         self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
@@ -387,7 +387,7 @@ impl<'a> BackendType<'a, ClientCredentials> {
 impl BackendType<'_, ComputeUserInfo> {
     pub async fn get_allowed_ips(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
         use BackendType::*;
         match self {
@@ -404,7 +404,7 @@ impl BackendType<'_, ComputeUserInfo> {
     /// The link auth flow doesn't support this, so we return [`None`] in that case.
     pub async fn wake_compute(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
     ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
         use BackendType::*;
 
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index b0a73fd03d..deab966d9e 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -196,15 +196,15 @@ pub mod errors {
 }
 
 /// Extra query params we'd like to pass to the console.
-pub struct ConsoleReqExtra<'a> {
+pub struct ConsoleReqExtra {
     /// A unique identifier for a connection.
     pub session_id: uuid::Uuid,
     /// Name of client application, if set.
-    pub application_name: Option<&'a str>,
+    pub application_name: String,
     pub options: Vec<(String, String)>,
 }
 
-impl<'a> ConsoleReqExtra<'a> {
+impl ConsoleReqExtra {
     // https://swagger.io/docs/specification/serialization/ DeepObject format
     // paramName[prop1]=value1&paramName[prop2]=value2&....
     pub fn options_as_deep_object(&self) -> Vec<(String, String)> {
@@ -259,20 +259,20 @@ pub trait Api {
     /// Get the client's auth secret for authentication.
     async fn get_auth_info(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;
 
     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 8aad8c06bc..c464b4daf2 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -144,7 +144,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn get_auth_info(
         &self,
-        _extra: &ConsoleReqExtra<'_>,
+        _extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         self.do_get_auth_info(creds).await
@@ -152,7 +152,7 @@ impl super::Api for Api {
 
     async fn get_allowed_ips(
         &self,
-        _extra: &ConsoleReqExtra<'_>,
+        _extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
         Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
@@ -161,7 +161,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
-        _extra: &ConsoleReqExtra<'_>,
+        _extra: &ConsoleReqExtra,
         _creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         self.do_wake_compute()
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index f8c3ee5b58..192252a0df 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -48,7 +48,7 @@ impl Api {
 
     async fn do_get_auth_info(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         let request_id = uuid::Uuid::new_v4().to_string();
@@ -60,9 +60,9 @@ impl Api {
                 .header("Authorization", format!("Bearer {}", &self.jwt))
                 .query(&[("session_id", extra.session_id)])
                 .query(&[
-                    ("application_name", extra.application_name),
-                    ("project", Some(&creds.endpoint)),
-                    ("role", Some(&creds.inner.user)),
+                    ("application_name", extra.application_name.as_str()),
+                    ("project", creds.endpoint.as_str()),
+                    ("role", creds.inner.user.as_str()),
                 ])
                 .build()?;
 
@@ -101,7 +101,7 @@ impl Api {
 
     async fn do_wake_compute(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<NodeInfo, WakeComputeError> {
         let request_id = uuid::Uuid::new_v4().to_string();
@@ -113,8 +113,8 @@ impl Api {
                 .header("Authorization", format!("Bearer {}", &self.jwt))
                 .query(&[("session_id", extra.session_id)])
                 .query(&[
-                    ("application_name", extra.application_name),
-                    ("project", Some(&creds.endpoint)),
+                    ("application_name", extra.application_name.as_str()),
+                    ("project", creds.endpoint.as_str()),
                 ]);
 
             request_builder = if extra.options.is_empty() {
@@ -161,7 +161,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn get_auth_info(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         self.do_get_auth_info(extra, creds).await
@@ -169,7 +169,7 @@ impl super::Api for Api {
 
     async fn get_allowed_ips(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
         let key: &str = &creds.endpoint;
@@ -192,7 +192,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
-        extra: &ConsoleReqExtra<'_>,
+        extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         let key: &str = &creds.inner.cache_key;
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 018f774c7e..152c894ca9 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -671,7 +671,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
 pub async fn connect_to_compute<M: ConnectMechanism>(
     mechanism: &M,
     mut node_info: console::CachedNodeInfo,
-    extra: &console::ConsoleReqExtra<'_>,
+    extra: &console::ConsoleReqExtra,
     creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
     mut latency_timer: LatencyTimer,
 ) -> Result<M::Connection, M::Error>
@@ -968,13 +968,17 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             allow_self_signed_compute,
         } = self;
 
+        let proto = mode.protocol_label();
         let extra = console::ConsoleReqExtra {
             session_id, // aka this connection's id
-            application_name: params.get("application_name"),
+            application_name: format!(
+                "{}/{}",
+                params.get("application_name").unwrap_or_default(),
+                proto
+            ),
             options: neon_options(params),
         };
-
-        let mut latency_timer = LatencyTimer::new(mode.protocol_label());
+        let mut latency_timer = LatencyTimer::new(proto);
 
         let user = creds.get_user().to_owned();
         let auth_result = match creds
@@ -1012,7 +1016,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
         .or_else(|e| stream.throw_error(e))
         .await?;
 
-        let proto = mode.protocol_label();
         NUM_DB_CONNECTIONS_OPENED_COUNTER
             .with_label_values(&[proto])
             .inc();
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 31c3ad1055..4691abbfb9 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -484,13 +484,13 @@ fn helper_create_connect_info(
     mechanism: &TestConnectMechanism,
 ) -> (
     CachedNodeInfo,
-    console::ConsoleReqExtra<'static>,
+    console::ConsoleReqExtra,
     auth::BackendType<'_, ComputeUserInfo>,
 ) {
     let cache = helper_create_cached_node_info();
     let extra = console::ConsoleReqExtra {
         session_id: uuid::Uuid::new_v4(),
-        application_name: Some("TEST"),
+        application_name: "TEST".into(),
         options: vec![],
     };
     let creds = auth::BackendType::Test(mechanism);
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 734df11368..4f3b31b9be 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -37,7 +37,7 @@ use crate::proxy::ConnectMechanism;
 use tracing::{error, warn, Span};
 use tracing::{info, info_span, Instrument};
 
-pub const APP_NAME: &str = "sql_over_http";
+pub const APP_NAME: &str = "/sql_over_http";
 const MAX_CONNS_PER_ENDPOINT: usize = 20;
 
 #[derive(Debug, Clone)]
@@ -432,7 +432,7 @@ async fn connect_to_compute(
 
     let extra = console::ConsoleReqExtra {
         session_id: uuid::Uuid::new_v4(),
-        application_name: Some(APP_NAME),
+        application_name: APP_NAME.to_string(),
         options: console_options,
     };
     // TODO(anna): this is a bit hacky way, consider using console notification listener.

From 8bb4a13192c853cad2ab46a50e351324563e1ce2 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 12 Dec 2023 14:23:45 +0200
Subject: [PATCH 075/202] Do not materialize null images in PS (#5979)

## Problem

PG16 writes null page images during relation extension.
The page server implements an optimisation which replaces a WAL record
containing an FPI with the page image itself.
So instead of a WAL record of ~30 bytes we store an 8kb null page image.
And this image is almost useless, because most likely it will shortly be
rewritten with the actual page content.

## Summary of changes

Do not materialize WAL records whose FPI is a null page.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pageserver/src/walingest.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 75b29a2fed..738216afa5 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -458,8 +458,10 @@ impl<'a> WalIngest<'a> {
             && decoded.xl_rmid == pg_constants::RM_XLOG_ID
             && (decoded.xl_info == pg_constants::XLOG_FPI
                 || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
-        // compression of WAL is not yet supported: fall back to storing the original WAL record
+            // compression of WAL is not yet supported: fall back to storing the original WAL record
             && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
+            // do not materialize null pages because they will most likely soon be replaced with real data
+            && blk.bimg_len != 0
         {
             // Extract page image from FPI record
             let img_len = blk.bimg_len as usize;

From aec1acdbacd760833935f1140e245c0533182c6f Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 12 Dec 2023 14:24:21 +0200
Subject: [PATCH 076/202] Do not inherit replication slots in branch (#5898)

## Problem

See
https://github.com/neondatabase/company_projects/issues/111
https://neondb.slack.com/archives/C03H1K0PGKH/p1700166126954079


## Summary of changes

Do not search for AUX_FILES_KEY in parent timelines

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
---
 pageserver/src/pgdatadir_mapping.rs           | 25 +++++++++++------
 pageserver/src/tenant/timeline.rs             |  4 +--
 .../regress/test_logical_replication.py       | 27 +++++++++++++++++++
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index c653f0b7ea..b81037ae47 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -822,10 +822,7 @@ impl<'a> DatadirModification<'a> {
         self.put(DBDIR_KEY, Value::Image(buf.into()));
 
         // Create AuxFilesDirectory
-        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-            files: HashMap::new(),
-        })?;
-        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+        self.init_aux_dir()?;
 
         let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
             xids: HashSet::new(),
@@ -933,10 +930,7 @@ impl<'a> DatadirModification<'a> {
             self.put(DBDIR_KEY, Value::Image(buf.into()));
 
             // Create AuxFilesDirectory as well
-            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-                files: HashMap::new(),
-            })?;
-            self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+            self.init_aux_dir()?;
         }
         if r.is_none() {
             // Create RelDirectory
@@ -1261,6 +1255,14 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
+        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+            files: HashMap::new(),
+        })?;
+        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+        Ok(())
+    }
+
     pub async fn put_file(
         &mut self,
         path: &str,
@@ -1767,6 +1769,13 @@ const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.
 
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY
+}
+
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f3907a6d2b..81dbc04793 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -66,7 +66,7 @@ use crate::metrics::{
     TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::LsnForTimestamp;
-use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
+use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
 use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
 use pageserver_api::reltag::RelTag;
@@ -2278,7 +2278,7 @@ impl Timeline {
             }
 
             // Recurse into ancestor if needed
-            if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
+            if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
                 trace!(
                     "going into ancestor {}, cont_lsn is {}",
                     timeline.ancestor_lsn,
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index d2d8d71e3f..51e358e60d 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -236,3 +236,30 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
     assert vanilla_pg.safe_psql(
         "select sum(somedata) from replication_example"
     ) == endpoint.safe_psql("select sum(somedata) from replication_example")
+
+
+#
+# Check that replication slots are not inherited by a branch
+#
+def test_slots_and_branching(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    tenant, timeline = env.neon_cli.create_tenant()
+    env.pageserver.http_client()
+
+    main_branch = env.endpoints.create_start("main", tenant_id=tenant)
+    main_cur = main_branch.connect().cursor()
+
+    # Create a logical replication slot on the main branch
+    main_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
+
+    wait_for_last_flush_lsn(env, main_branch, tenant, timeline)
+
+    # Create branch ws.
+    env.neon_cli.create_branch("ws", "main", tenant_id=tenant)
+    ws_branch = env.endpoints.create_start("ws", tenant_id=tenant)
+    log.info("postgres is running on 'ws' branch")
+
+    # Check that we can create slot with the same name
+    ws_cur = ws_branch.connect().cursor()
+    ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")

From 6acbee23680317b317bc87dc86b504fcabaa8fd3 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 12 Dec 2023 16:24:13 +0000
Subject: [PATCH 077/202] test_runner: add `from_repo_dir` method (#6087)

## Problem

We need a reliable way to restore a project state (in this context, I
mean data on pageservers, safekeepers, and remote storage) from a
snapshot. The existing method (that we use in `test_compatibility`)
heavily relies on config files, which makes it harder to add/change
fields in the config.
The proposed solution uses the config file only to get `default_tenant_id`
and `branch_name_mappings`.

## Summary of changes
- Add `NeonEnvBuilder#from_repo_dir` method, which allows using the
`neon_env_builder` fixture with data from a snapshot.
- Use `NeonEnvBuilder#from_repo_dir` in compatibility tests

Required for https://github.com/neondatabase/neon/issues/6033
---
 test_runner/fixtures/neon_fixtures.py     |  60 +++++
 test_runner/regress/test_compatibility.py | 264 ++++------------------
 2 files changed, 101 insertions(+), 223 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index fb6cea5713..4b23650960 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -507,6 +507,66 @@ class NeonEnvBuilder:
 
         return env
 
+    def from_repo_dir(
+        self,
+        repo_dir: Path,
+        neon_binpath: Optional[Path] = None,
+        pg_distrib_dir: Optional[Path] = None,
+    ) -> NeonEnv:
+        """
+        A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir.
+        """
+
+        # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests
+        self.neon_binpath = neon_binpath or self.neon_binpath
+        self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir
+
+        # Get the initial tenant and timeline from the snapshot config
+        snapshot_config_toml = repo_dir / "config"
+        with snapshot_config_toml.open("r") as f:
+            snapshot_config = toml.load(f)
+
+        self.initial_tenant = TenantId(snapshot_config["default_tenant_id"])
+        self.initial_timeline = TimelineId(
+            dict(snapshot_config["branch_name_mappings"][DEFAULT_BRANCH_NAME])[
+                str(self.initial_tenant)
+            ]
+        )
+        self.env = self.init_configs()
+
+        for ps_dir in repo_dir.glob("pageserver_*"):
+            tenants_from_dir = ps_dir / "tenants"
+            tenants_to_dir = self.repo_dir / ps_dir.name / "tenants"
+
+            log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}")
+            shutil.copytree(tenants_from_dir, tenants_to_dir)
+
+        for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"):
+            sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name
+            log.info(f"Copying safekeeper directory {sk_from_dir} to {sk_to_dir}")
+            sk_to_dir.rmdir()
+            shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid"))
+
+        shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
+        shutil.copytree(
+            repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
+        )
+
+        if (attachments_json := Path(repo_dir / "attachments.json")).exists():
+            shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
+
+        # Update the config with info about tenants and timelines
+        with (self.repo_dir / "config").open("r") as f:
+            config = toml.load(f)
+
+        config["default_tenant_id"] = snapshot_config["default_tenant_id"]
+        config["branch_name_mappings"] = snapshot_config["branch_name_mappings"]
+
+        with (self.repo_dir / "config").open("w") as f:
+            toml.dump(config, f)
+
+        return self.env
+
     def enable_scrub_on_exit(self):
         """
         Call this if you would like the fixture to automatically run
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 35963c0d41..3f5de100fd 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -1,30 +1,25 @@
-import copy
 import os
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import Any, List, Optional
+from typing import List, Optional
 
 import pytest
 import toml
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
-    NeonCli,
+    NeonEnv,
     NeonEnvBuilder,
     PgBin,
 )
-from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
     wait_for_upload,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.port_distributor import PortDistributor
-from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser
+from fixtures.remote_storage import RemoteStorageKind
 from fixtures.types import Lsn
-from pytest import FixtureRequest
 
 #
 # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
@@ -37,8 +32,8 @@ from pytest import FixtureRequest
 #   If the breakage is intentional, the test can be xfaild with setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true.
 #
 # The file contains a couple of helper functions:
-# - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files).
 # - check_neon_works performs the test itself, feel free to add more checks there.
+# - dump_differs compares two SQL dumps and writes the diff to a file.
 #
 #
 # How to run `test_backward_compatibility` locally:
@@ -46,6 +41,7 @@ from pytest import FixtureRequest
 #    export DEFAULT_PG_VERSION=15
 #    export BUILD_TYPE=release
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
+#    export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
 #
 #    # Build previous version of binaries and create a data snapshot:
 #    rm -rf pg_install target
@@ -59,8 +55,7 @@ from pytest import FixtureRequest
 #    CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc`
 #
 #    # Run backward compatibility test
-#    COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} \
-#       ./scripts/pytest -k test_backward_compatibility
+#    ./scripts/pytest -k test_backward_compatibility
 #
 #
 # How to run `test_forward_compatibility` locally:
@@ -68,6 +63,8 @@ from pytest import FixtureRequest
 #    export DEFAULT_PG_VERSION=15
 #    export BUILD_TYPE=release
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
+#    export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
+#    export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
 #
 #    # Build previous version of binaries and store them somewhere:
 #    rm -rf pg_install target
@@ -84,9 +81,7 @@ from pytest import FixtureRequest
 #    ./scripts/pytest -k test_create_snapshot
 #
 #    # Run forward compatibility test
-#    COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} \
-#    COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install \
-#       ./scripts/pytest -k test_forward_compatibility
+#    ./scripts/pytest -k test_forward_compatibility
 #
 
 check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
@@ -155,13 +150,9 @@ def test_create_snapshot(
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
 def test_backward_compatibility(
-    pg_bin: PgBin,
-    port_distributor: PortDistributor,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
-    neon_binpath: Path,
-    pg_distrib_dir: Path,
     pg_version: PgVersion,
-    request: FixtureRequest,
 ):
     """
     Test that the new binaries can read old data
@@ -177,23 +168,15 @@ def test_backward_compatibility(
     )
 
     try:
-        # Copy the snapshot to current directory, and prepare for the test
-        prepare_snapshot(
-            from_dir=compatibility_snapshot_dir,
-            to_dir=test_output_dir / "compatibility_snapshot",
-            port_distributor=port_distributor,
-        )
+        neon_env_builder.num_safekeepers = 3
+        env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
+        neon_env_builder.start()
 
         check_neon_works(
-            test_output_dir / "compatibility_snapshot" / "repo",
-            neon_binpath,
-            neon_binpath,
-            pg_distrib_dir,
-            pg_version,
-            port_distributor,
-            test_output_dir,
-            pg_bin,
-            request,
+            env,
+            test_output_dir=test_output_dir,
+            sql_dump_path=compatibility_snapshot_dir / "dump.sql",
+            repo_dir=env.repo_dir,
         )
     except Exception:
         if breaking_changes_allowed:
@@ -212,12 +195,10 @@ def test_backward_compatibility(
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
 def test_forward_compatibility(
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     top_output_dir: Path,
-    port_distributor: PortDistributor,
     pg_version: PgVersion,
-    request: FixtureRequest,
-    neon_binpath: Path,
 ):
     """
     Test that the old binaries can read new data
@@ -244,24 +225,19 @@ def test_forward_compatibility(
     )
 
     try:
-        # Copy the snapshot to current directory, and prepare for the test
-        prepare_snapshot(
-            from_dir=compatibility_snapshot_dir,
-            to_dir=test_output_dir / "compatibility_snapshot",
-            port_distributor=port_distributor,
+        neon_env_builder.num_safekeepers = 3
+        env = neon_env_builder.from_repo_dir(
+            compatibility_snapshot_dir / "repo",
+            neon_binpath=compatibility_neon_bin,
             pg_distrib_dir=compatibility_postgres_distrib_dir,
         )
+        neon_env_builder.start()
 
         check_neon_works(
-            test_output_dir / "compatibility_snapshot" / "repo",
-            compatibility_neon_bin,
-            neon_binpath,
-            compatibility_postgres_distrib_dir,
-            pg_version,
-            port_distributor,
-            test_output_dir,
-            PgBin(test_output_dir, compatibility_postgres_distrib_dir, pg_version),
-            request,
+            env,
+            test_output_dir=test_output_dir,
+            sql_dump_path=compatibility_snapshot_dir / "dump.sql",
+            repo_dir=env.repo_dir,
         )
     except Exception:
         if breaking_changes_allowed:
@@ -276,189 +252,26 @@ def test_forward_compatibility(
     ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
-def prepare_snapshot(
-    from_dir: Path,
-    to_dir: Path,
-    port_distributor: PortDistributor,
-    pg_distrib_dir: Optional[Path] = None,
-):
-    assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist"
-    assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory"
-    assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql"
+def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):
+    ep = env.endpoints.create_start("main")
+    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
 
-    log.info(f"Copying snapshot from {from_dir} to {to_dir}")
-    shutil.copytree(from_dir, to_dir)
-
-    repo_dir = to_dir / "repo"
-
-    snapshot_config_toml = repo_dir / "config"
-    snapshot_config = toml.load(snapshot_config_toml)
-
-    # Remove old logs to avoid confusion in test artifacts
-    for logfile in repo_dir.glob("**/*.log"):
-        logfile.unlink()
-
-    # Remove old computes in 'endpoints'. Old versions of the control plane used a directory
-    # called "pgdatadirs". Delete it, too.
-    if (repo_dir / "endpoints").exists():
-        shutil.rmtree(repo_dir / "endpoints")
-    if (repo_dir / "pgdatadirs").exists():
-        shutil.rmtree(repo_dir / "pgdatadirs")
-    os.mkdir(repo_dir / "endpoints")
-
-    # Update paths and ports in config files
-    legacy_pageserver_toml = repo_dir / "pageserver.toml"
-    legacy_bundle = os.path.exists(legacy_pageserver_toml)
-
-    path_to_config: dict[Path, dict[Any, Any]] = {}
-    if legacy_bundle:
-        os.mkdir(repo_dir / "pageserver_1")
-        path_to_config[repo_dir / "pageserver_1" / "pageserver.toml"] = toml.load(
-            legacy_pageserver_toml
-        )
-        os.remove(legacy_pageserver_toml)
-        os.rename(repo_dir / "tenants", repo_dir / "pageserver_1" / "tenants")
-    else:
-        for ps_conf in snapshot_config["pageservers"]:
-            config_path = repo_dir / f"pageserver_{ps_conf['id']}" / "pageserver.toml"
-            path_to_config[config_path] = toml.load(config_path)
-
-    # For each pageserver config, edit it and rewrite
-    for config_path, pageserver_config in path_to_config.items():
-        pageserver_config["remote_storage"]["local_path"] = str(
-            LocalFsStorage.component_path(repo_dir, RemoteStorageUser.PAGESERVER)
-        )
-
-        for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
-            pageserver_config[param] = port_distributor.replace_with_new_port(
-                pageserver_config[param]
-            )
-
-        # We don't use authentication in compatibility tests
-        # so just remove authentication related settings.
-        pageserver_config.pop("pg_auth_type", None)
-        pageserver_config.pop("http_auth_type", None)
-
-        if pg_distrib_dir:
-            pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
-
-        with config_path.open("w") as f:
-            toml.dump(pageserver_config, f)
-
-    # neon_local config doesn't have to be backward compatible.  If we're using a dump from before
-    # it supported multiple pageservers, fix it up.
-    if "pageservers" not in snapshot_config:
-        snapshot_config["pageservers"] = [snapshot_config["pageserver"]]
-        del snapshot_config["pageserver"]
-
-    for param in ("listen_http_addr", "listen_pg_addr"):
-        for pageserver in snapshot_config["pageservers"]:
-            pageserver[param] = port_distributor.replace_with_new_port(pageserver[param])
-    snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
-        snapshot_config["broker"]["listen_addr"]
-    )
-    for sk in snapshot_config["safekeepers"]:
-        for param in ("http_port", "pg_port", "pg_tenant_only_port"):
-            sk[param] = port_distributor.replace_with_new_port(sk[param])
-
-    if pg_distrib_dir:
-        snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
-
-    with snapshot_config_toml.open("w") as f:
-        toml.dump(snapshot_config, f)
-
-    # Ensure that snapshot doesn't contain references to the original path
-    rv = subprocess.run(
-        [
-            "grep",
-            "--recursive",
-            "--binary-file=without-match",
-            "--files-with-matches",
-            "test_create_snapshot/repo",
-            str(repo_dir),
-        ],
-        capture_output=True,
-        text=True,
-    )
-    assert (
-        rv.returncode != 0
-    ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"
-
-
-def check_neon_works(
-    repo_dir: Path,
-    neon_target_binpath: Path,
-    neon_current_binpath: Path,
-    pg_distrib_dir: Path,
-    pg_version: PgVersion,
-    port_distributor: PortDistributor,
-    test_output_dir: Path,
-    pg_bin: PgBin,
-    request: FixtureRequest,
-):
-    snapshot_config_toml = repo_dir / "config"
-    snapshot_config = toml.load(snapshot_config_toml)
-    snapshot_config["neon_distrib_dir"] = str(neon_target_binpath)
-    snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir)
-    with (snapshot_config_toml).open("w") as f:
-        toml.dump(snapshot_config, f)
-
-    # TODO: replace with NeonEnvBuilder / NeonEnv
-    config: Any = type("NeonEnvStub", (object,), {})
-    config.rust_log_override = None
-    config.repo_dir = repo_dir
-    config.pg_version = pg_version
-    config.initial_tenant = snapshot_config["default_tenant_id"]
-    config.pg_distrib_dir = pg_distrib_dir
-    config.remote_storage = None
-    config.safekeepers_remote_storage = None
-
-    # Use the "target" binaries to launch the storage nodes
-    config_target = config
-    config_target.neon_binpath = neon_target_binpath
-    # We are using maybe-old binaries for neon services, but want to use current
-    # binaries for test utilities like neon_local
-    config_target.neon_local_binpath = neon_current_binpath
-    cli_target = NeonCli(config_target)
-
-    # And the current binaries to launch computes
-    snapshot_config["neon_distrib_dir"] = str(neon_current_binpath)
-    with (snapshot_config_toml).open("w") as f:
-        toml.dump(snapshot_config, f)
-    config_current = copy.copy(config)
-    config_current.neon_binpath = neon_current_binpath
-    cli_current = NeonCli(config_current)
-
-    cli_target.raw_cli(["start"])
-    request.addfinalizer(lambda: cli_target.raw_cli(["stop"]))
-
-    pg_port = port_distributor.get_port()
-    http_port = port_distributor.get_port()
-    cli_current.endpoint_create(
-        branch_name="main", pg_port=pg_port, http_port=http_port, endpoint_id="ep-main"
-    )
-    cli_current.endpoint_start("ep-main")
-    request.addfinalizer(lambda: cli_current.endpoint_stop("ep-main"))
-
-    connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres"
+    connstr = ep.connstr()
     pg_bin.run_capture(
         ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]
     )
     initial_dump_differs = dump_differs(
-        repo_dir.parent / "dump.sql",
+        sql_dump_path,
         test_output_dir / "dump.sql",
         test_output_dir / "dump.filediff",
     )
 
     # Check that project can be recovered from WAL
     # loosely based on https://www.notion.so/neondatabase/Storage-Recovery-from-WAL-d92c0aac0ebf40df892b938045d7d720
-    tenant_id = snapshot_config["default_tenant_id"]
-    timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
-    pageserver_port = snapshot_config["pageservers"][0]["listen_http_addr"].split(":")[-1]
-    pageserver_http = PageserverHttpClient(
-        port=pageserver_port,
-        is_testing_enabled_or_skip=lambda: True,  # TODO: check if testing really enabled
-    )
+    pageserver_http = env.pageserver.http_client()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    pg_version = env.pg_version
 
     shutil.rmtree(repo_dir / "local_fs_remote_storage")
     timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
@@ -494,6 +307,11 @@ def dump_differs(
     Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False).
     """
 
+    if not first.exists():
+        raise FileNotFoundError(f"{first} doesn't exist")
+    if not second.exists():
+        raise FileNotFoundError(f"{second} doesn't exist")
+
     with output.open("w") as stdout:
         res = subprocess.run(
             [

From dfb0a6fdaf8eecbfacf5b1d9a5dc62ec26bd64c9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 12 Dec 2023 16:53:08 +0000
Subject: [PATCH 078/202] scrubber: handle initdb files, fix an issue with
 prefixes (#6079)

- The code for calculating the prefix in the bucket was expecting a
trailing slash (as it is in the tests), but that's an awkward
expectation to impose for use in the field: make the code more flexible
by only trimming a trailing character if it is indeed a slash.
- initdb archives were detected by the scrubber as malformed layer
files. Teach it to recognize and ignore them.
---
 s3_scrubber/src/checks.rs        | 22 +++++++++++++++++++++-
 s3_scrubber/src/lib.rs           |  4 +++-
 s3_scrubber/src/main.rs          | 13 ++++++++++++-
 s3_scrubber/src/scan_metadata.rs |  4 ++++
 4 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index 510a128663..a15a908212 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -142,7 +142,9 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                         .collect();
 
                     if !orphan_layers.is_empty() {
-                        result.errors.push(format!(
+                        // An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report
+                        // these as a hint that there is something worth cleaning up here.
+                        result.warnings.push(format!(
                             "index_part.json does not contain layers from S3: {:?}",
                             orphan_layers
                                 .iter()
@@ -170,6 +172,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                         ));
                     }
                 }
+                BlobDataParseResult::Relic => {}
                 BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
                     parse_errors
                         .into_iter()
@@ -215,6 +218,8 @@ pub(crate) enum BlobDataParseResult {
         index_part_generation: Generation,
         s3_layers: HashSet<(LayerFileName, Generation)>,
     },
+    /// The remains of a deleted Timeline (i.e. an initdb archive only)
+    Relic,
     Incorrect(Vec<String>),
 }
 
@@ -245,6 +250,7 @@ pub(crate) async fn list_timeline_blobs(
     timeline_dir_target.delimiter = String::new();
 
     let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
+    let mut initdb_archive: bool = false;
 
     let stream = stream_listing(s3_client, &timeline_dir_target);
     pin_mut!(stream);
@@ -258,6 +264,10 @@ pub(crate) async fn list_timeline_blobs(
                 tracing::info!("Index key {key}");
                 index_parts.push(obj)
             }
+            Some("initdb.tar.zst") => {
+                tracing::info!("initdb archive {key}");
+                initdb_archive = true;
+            }
             Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                 Ok((new_layer, gen)) => {
                     tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
@@ -279,6 +289,16 @@ pub(crate) async fn list_timeline_blobs(
         }
     }
 
+    if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive {
+        tracing::info!(
+            "Timeline is empty apart from initdb archive: expected post-deletion state."
+        );
+        return Ok(S3TimelineBlobData {
+            blob_data: BlobDataParseResult::Relic,
+            keys_to_remove: Vec::new(),
+        });
+    }
+
     // Choose the index_part with the highest generation
     let (index_part_object, index_part_generation) = match index_parts
         .iter()
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index e5465952fb..6607db21e6 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -86,7 +86,9 @@ impl S3Target {
         if new_self.prefix_in_bucket.is_empty() {
             new_self.prefix_in_bucket = format!("/{}/", new_segment);
         } else {
-            let _ = new_self.prefix_in_bucket.pop();
+            if new_self.prefix_in_bucket.ends_with('/') {
+                new_self.prefix_in_bucket.pop();
+            }
             new_self.prefix_in_bucket =
                 [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
         }
diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs
index 1f0ceebdaf..ef020edc2a 100644
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -57,7 +57,7 @@ async fn main() -> anyhow::Result<()> {
     ));
 
     match cli.command {
-        Command::ScanMetadata { json } => match scan_metadata(bucket_config).await {
+        Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await {
             Err(e) => {
                 tracing::error!("Failed: {e}");
                 Err(e)
@@ -70,6 +70,17 @@ async fn main() -> anyhow::Result<()> {
                 }
                 if summary.is_fatal() {
                     Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                } else if summary.is_empty() {
+                    // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                    // scrubber they were likely expecting to scan something, and if we see no timelines
+                    // at all then it's likely due to some configuration issues like a bad prefix
+                    Err(anyhow::anyhow!(
+                        "No timelines found in bucket {} prefix {}",
+                        bucket_config.bucket,
+                        bucket_config
+                            .prefix_in_bucket
+                            .unwrap_or("<none>".to_string())
+                    ))
                 } else {
                     Ok(())
                 }
diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs
index ad82db1e76..228f8d6763 100644
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -174,6 +174,10 @@ Timeline layer count: {6}
     pub fn is_fatal(&self) -> bool {
         !self.with_errors.is_empty()
     }
+
+    pub fn is_empty(&self) -> bool {
+        self.count == 0
+    }
 }
 
 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.

From 5820faaa876c35b0504b1f5136d851437651a765 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 12 Dec 2023 18:00:37 +0100
Subject: [PATCH 079/202] Use extend instead of groups of append calls in tests
 (#6109)

Repeated calls to `.append` don't line up nicely, because each call may
get formatted differently. They also take more characters and can make
the lines longer.

Saw this while working on #5912.
---
 test_runner/regress/test_auth.py              | 11 ++--
 test_runner/regress/test_branch_behind.py     |  5 +-
 test_runner/regress/test_branching.py         | 20 +++---
 test_runner/regress/test_import.py            | 13 ++--
 .../test_pageserver_metric_collection.py      | 28 ++++----
 test_runner/regress/test_remote_storage.py    | 25 +++----
 test_runner/regress/test_tenant_delete.py     | 14 ++--
 test_runner/regress/test_tenant_detach.py     | 12 ++--
 test_runner/regress/test_tenant_relocation.py | 17 ++---
 .../test_tenants_with_remote_storage.py       | 23 ++++---
 .../regress/test_threshold_based_eviction.py  | 13 ++--
 test_runner/regress/test_timeline_delete.py   | 66 ++++++++++---------
 12 files changed, 133 insertions(+), 114 deletions(-)

diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py
index 7487106c44..bd87ff3efd 100644
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -92,8 +92,9 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.auth_enabled = True
     env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
-    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
+    env.pageserver.allowed_errors.extend(
+        [".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"]
+    )
 
     pageserver_token_old = env.auth_keys.generate_pageserver_token()
     pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
@@ -145,9 +146,9 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.auth_enabled = True
     env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
-    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
-
+    env.pageserver.allowed_errors.extend(
+        [".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"]
+    )
     pageserver_token_old = env.auth_keys.generate_pageserver_token()
     pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
 
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index a19b2862f8..9879254897 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -14,8 +14,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
     env = neon_env_builder.init_start()
 
-    env.pageserver.allowed_errors.append(".*invalid branch start lsn.*")
-    env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
+    env.pageserver.allowed_errors.extend(
+        [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
+    )
 
     # Branch at the point where only 100 rows were inserted
     branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index a908dd713a..82ca985d01 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -148,11 +148,11 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
     env = neon_env_builder.init_configs()
     env.start()
 
-    env.pageserver.allowed_errors.append(
-        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
-    )
-    env.pageserver.allowed_errors.append(
-        ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading"
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
+            ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading",
+        ]
     )
     ps_http = env.pageserver.http_client()
 
@@ -247,11 +247,11 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N
     env = neon_env_builder.init_configs()
     env.start()
 
-    env.pageserver.allowed_errors.append(
-        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
-    )
-    env.pageserver.allowed_errors.append(
-        ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory"
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
+            ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory",
+        ]
     )
     ps_http = env.pageserver.http_client()
 
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 920e8d0b72..faedf5d944 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -99,12 +99,13 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
         ]
     )
 
-    # FIXME: we should clean up pageserver to not print this
-    env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*")
-
-    # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
-        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    env.pageserver.allowed_errors.extend(
+        [
+            # FIXME: we should clean up pageserver to not print this
+            ".*exited with error: unexpected message type: CopyData.*",
+            # FIXME: Is this expected?
+            ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*",
+        ]
     )
 
     def import_tar(base, wal):
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index b76dbbee03..042961baa5 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -64,13 +64,13 @@ def test_metric_collection(
     # spin up neon,  after http server is ready
     env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     # httpserver is shut down before pageserver during passing run
-    env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
-    # we have a fast rate of calculation, these can happen at shutdown
-    env.pageserver.allowed_errors.append(
-        ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
-    )
-    env.pageserver.allowed_errors.append(
-        ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*metrics endpoint refused the sent metrics*",
+            # we have a fast rate of calculation, these can happen at shutdown
+            ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*",
+            ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes",
+        ]
     )
 
     tenant_id = env.initial_tenant
@@ -212,13 +212,13 @@ def test_metric_collection_cleans_up_tempfile(
     pageserver_http = env.pageserver.http_client()
 
     # httpserver is shut down before pageserver during passing run
-    env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
-    # we have a fast rate of calculation, these can happen at shutdown
-    env.pageserver.allowed_errors.append(
-        ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
-    )
-    env.pageserver.allowed_errors.append(
-        ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*metrics endpoint refused the sent metrics*",
+            # we have a fast rate of calculation, these can happen at shutdown
+            ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*",
+            ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes",
+        ]
     )
 
     tenant_id = env.initial_tenant
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 0a5046e219..3004d69f50 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -73,19 +73,20 @@ def test_remote_storage_backup_and_restore(
     ##### First start, insert data and upload it to the remote storage
     env = neon_env_builder.init_start()
 
-    # FIXME: Is this expected?
-    env.pageserver.allowed_errors.append(
-        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    env.pageserver.allowed_errors.extend(
+        [
+            # FIXME: Is this expected?
+            ".*marking .* as locally complete, while it doesnt exist in remote index.*",
+            ".*No timelines to attach received.*",
+            ".*Failed to get local tenant state.*",
+            # FIXME retry downloads without throwing errors
+            ".*failed to load remote timeline.*",
+            # we have a bunch of pytest.raises for these below
+            ".*tenant .*? already exists, state:.*",
+            ".*tenant directory already exists.*",
+            ".*simulated failure of remote operation.*",
+        ]
     )
-    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
-
-    env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
-    # FIXME retry downloads without throwing errors
-    env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
-    # we have a bunch of pytest.raises for these below
-    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
-    env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
-    env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*")
 
     pageserver_http = env.pageserver.http_client()
     endpoint = env.endpoints.create_start("main")
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 48f5682371..fece876459 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -395,13 +395,13 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
     env.start()
     pageserver_http = env.pageserver.http_client()
 
-    # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero
-    env.pageserver.allowed_errors.append(
-        ".*Timeline got dropped without initializing, cleaning its files"
-    )
-    # the response hit_pausable_failpoint_and_later_fail
-    env.pageserver.allowed_errors.append(
-        f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn"
+    env.pageserver.allowed_errors.extend(
+        [
+            # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero
+            ".*Timeline got dropped without initializing, cleaning its files",
+            # the response hit_pausable_failpoint_and_later_fail
+            f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn",
+        ]
     )
 
     env.pageserver.tenant_create(env.initial_tenant)
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 5b63bd6161..0dcbb23ad4 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -307,10 +307,14 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
         bogus_timeline_id = TimelineId.generate()
         pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)
 
-    # the error will be printed to the log too
-    env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
-    # Timelines get stopped during detach, ignore the gc calls that error, witnessing that
-    env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*")
+    env.pageserver.allowed_errors.extend(
+        [
+            # the error will be printed to the log too
+            ".*gc target timeline does not exist.*",
+            # Timelines get stopped during detach, ignore the gc calls that error, witnessing that
+            ".*InternalServerError\\(timeline is Stopping.*",
+        ]
+    )
 
     # Detach while running manual GC.
     # It should wait for manual GC to finish because it runs in a task associated with the tenant.
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index feacdcc802..dcd7232b1b 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -216,16 +216,17 @@ def test_tenant_relocation(
 
     tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209")
 
-    # FIXME: Is this expected?
-    env.pageservers[0].allowed_errors.append(
-        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    env.pageservers[0].allowed_errors.extend(
+        [
+            # FIXME: Is this expected?
+            ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*",
+            # Needed for detach polling on the original pageserver
+            f".*NotFound: tenant {tenant_id}.*",
+            # We will dual-attach in this test, so stale generations are expected
+            ".*Dropped remote consistent LSN updates.*",
+        ]
     )
 
-    # Needed for detach polling on the original pageserver
-    env.pageservers[0].allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
-    # We will dual-attach in this test, so stale generations are expected
-    env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
 
     # we use two branches to check that they are both relocated
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index b7b4e2be0b..07fb6dc5ca 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -117,10 +117,12 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
     ##### First start, insert secret data and upload it to the remote storage
     env = neon_env_builder.init_start()
 
-    # FIXME: Are these expected?
-    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
-    env.pageserver.allowed_errors.append(
-        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    env.pageserver.allowed_errors.extend(
+        [
+            # FIXME: Are these expected?
+            ".*No timelines to attach received.*",
+            ".*marking .* as locally complete, while it doesnt exist in remote index.*",
+        ]
     )
 
     pageserver_http = env.pageserver.http_client()
@@ -218,13 +220,14 @@ def test_tenant_redownloads_truncated_file_on_startup(
 
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
 
-    env.pageserver.allowed_errors.append(".*removing local file .* because .*")
-
-    # FIXME: Are these expected?
-    env.pageserver.allowed_errors.append(
-        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*removing local file .* because .*",
+            # FIXME: Are these expected?
+            ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*",
+            ".*No timelines to attach received.*",
+        ]
     )
-    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
 
     pageserver_http = env.pageserver.http_client()
     endpoint = env.endpoints.create_start("main")
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
index 27d5cce5f2..5f72cfd747 100644
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -36,12 +36,13 @@ def test_threshold_based_eviction(
         ".*metrics_collection:.* upload consumption_metrics (still failed|failed, will retry).*"
     )
     env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(metrics_refused_log_line)
-
-    # these can happen whenever we run consumption metrics collection
-    env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled")
-    env.pageserver.allowed_errors.append(
-        r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes"
+    env.pageserver.allowed_errors.extend(
+        [
+            metrics_refused_log_line,
+            # these can happen whenever we run consumption metrics collection
+            r".*failed to calculate logical size at \S+: cancelled",
+            r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes",
+        ]
     )
 
     tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 17113a6bc5..c6d578a7a2 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -39,10 +39,14 @@ from urllib3.util.retry import Retry
 def test_timeline_delete(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
-    env.pageserver.allowed_errors.append(".*Timeline .* was not found.*")
-    env.pageserver.allowed_errors.append(".*timeline not found.*")
-    env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*")
-    env.pageserver.allowed_errors.append(".*Precondition failed: Requested tenant is missing.*")
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*Timeline .* was not found.*",
+            ".*timeline not found.*",
+            ".*Cannot delete timeline which has child timelines.*",
+            ".*Precondition failed: Requested tenant is missing.*",
+        ]
+    )
 
     ps_http = env.pageserver.http_client()
 
@@ -198,22 +202,22 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
             ),
         )
 
-    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
-    # It appears when we stopped flush loop during deletion and then pageserver is stopped
-    env.pageserver.allowed_errors.append(
-        ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+    env.pageserver.allowed_errors.extend(
+        [
+            f".*{timeline_id}.*failpoint: {failpoint}",
+            # It appears when we stopped flush loop during deletion and then pageserver is stopped
+            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # This happens when we fail before scheduling background operation.
+            # Timeline is left in stopping state and retry tries to stop it again.
+            ".*Ignoring new state, equal to the existing one: Stopping",
+            # This happens when we retry delete requests for broken timelines
+            ".*Ignoring state update Stopping for broken timeline",
+            # This happens when timeline remains are cleaned up during loading
+            ".*Timeline dir entry become invalid.*",
+            # In one of the branches we poll for tenant to become active. Polls can generate this log message:
+            f".*Tenant {env.initial_tenant} is not active*",
+        ]
     )
-    # This happens when we fail before scheduling background operation.
-    # Timeline is left in stopping state and retry tries to stop it again.
-    env.pageserver.allowed_errors.append(
-        ".*Ignoring new state, equal to the existing one: Stopping"
-    )
-    # This happens when we retry delete requests for broken timelines
-    env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
-    # This happens when timeline remains are cleaned up during loading
-    env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
-    # In one of the branches we poll for tenant to become active. Polls can generate this log message:
-    env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")
 
     ps_http.configure_failpoints((failpoint, "return"))
 
@@ -398,13 +402,13 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
 
     env = neon_env_builder.init_start()
 
-    env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm")
-    env.pageserver.allowed_errors.append(
-        ".*Ignoring new state, equal to the existing one: Stopping"
-    )
-    # this happens, because the stuck timeline is visible to shutdown
-    env.pageserver.allowed_errors.append(
-        ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*failpoint: timeline-delete-before-rm",
+            ".*Ignoring new state, equal to the existing one: Stopping",
+            # this happens, because the stuck timeline is visible to shutdown
+            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+        ]
     )
 
     ps_http = env.pageserver.http_client()
@@ -551,10 +555,12 @@ def test_concurrent_timeline_delete_stuck_on(
         with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
             ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
         assert second_call_err.value.status_code == 409
-        env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*")
-        # the second call will try to transition the timeline into Stopping state as well
-        env.pageserver.allowed_errors.append(
-            f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+        env.pageserver.allowed_errors.extend(
+            [
+                f".*{child_timeline_id}.*{error_msg_re}.*",
+                # the second call will try to transition the timeline into Stopping state as well
+                f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping",
+            ]
         )
         log.info("second call failed as expected")
 

From 7c2c87a5abb8f7710c4efa2c8ba60c36a17a1cc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 12 Dec 2023 18:20:12 +0100
Subject: [PATCH 080/202] Update azure SDK to 0.18 and use open range support
 (#6103)

* Update `azure-*` crates to 0.18
* Use new open ranges support added by upstream in
https://github.com/Azure/azure-sdk-for-rust/pull/1482

Part of #5567. Prior update PR: #6081
---
 Cargo.lock                            | 42 ++++++++++++++++++---------
 Cargo.toml                            |  8 ++---
 libs/remote_storage/src/azure_blob.rs | 15 ++++------
 3 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index da480b8e0c..1b6b423444 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -694,9 +694,9 @@ dependencies = [
 
 [[package]]
 name = "azure_core"
-version = "0.17.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ccd63c07d1fbfb3d4543d7ea800941bf5a30db1911b9b9e4db3b2c4210a434f"
+checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
 dependencies = [
  "async-trait",
  "base64 0.21.1",
@@ -704,8 +704,10 @@ dependencies = [
  "dyn-clone",
  "futures",
  "getrandom 0.2.11",
+ "hmac",
  "http-types",
  "log",
+ "once_cell",
  "paste",
  "pin-project",
  "quick-xml",
@@ -714,6 +716,7 @@ dependencies = [
  "rustc_version",
  "serde",
  "serde_json",
+ "sha2",
  "time",
  "url",
  "uuid",
@@ -721,9 +724,9 @@ dependencies = [
 
 [[package]]
 name = "azure_identity"
-version = "0.17.0"
+version = "0.18.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bd7ea32ca7eb66ff4757f83baac702ff11d469e5de365b6bc6f79f9c25d3436"
+checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
 dependencies = [
  "async-lock",
  "async-trait",
@@ -733,7 +736,6 @@ dependencies = [
  "oauth2",
  "pin-project",
  "serde",
- "serde_json",
  "time",
  "tz-rs",
  "url",
@@ -742,21 +744,18 @@ dependencies = [
 
 [[package]]
 name = "azure_storage"
-version = "0.17.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83ca0a07f89fd72a006da4713e93af3d6c44a693e61a1c3c2e7985de39c182e8"
+checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
 dependencies = [
  "RustyXML",
+ "async-lock",
  "async-trait",
  "azure_core",
  "bytes",
- "futures",
- "hmac",
  "log",
  "serde",
  "serde_derive",
- "serde_json",
- "sha2",
  "time",
  "url",
  "uuid",
@@ -764,13 +763,14 @@ dependencies = [
 
 [[package]]
 name = "azure_storage_blobs"
-version = "0.17.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8096c04d370118323c42b2752aa1883e4880a56ef65239f317b359f263b6e194"
+checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
 dependencies = [
  "RustyXML",
  "azure_core",
  "azure_storage",
+ "azure_svc_blobstorage",
  "bytes",
  "futures",
  "log",
@@ -782,6 +782,22 @@ dependencies = [
  "uuid",
 ]
 
+[[package]]
+name = "azure_svc_blobstorage"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
+dependencies = [
+ "azure_core",
+ "bytes",
+ "futures",
+ "log",
+ "once_cell",
+ "serde",
+ "serde_json",
+ "time",
+]
+
 [[package]]
 name = "backtrace"
 version = "0.3.67"
diff --git a/Cargo.toml b/Cargo.toml
index b5eece5e35..496a9d7839 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,10 +38,10 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
-azure_core = "0.17"
-azure_identity = "0.17"
-azure_storage = "0.17"
-azure_storage_blobs = "0.17"
+azure_core = "0.18"
+azure_identity = "0.18"
+azure_storage = "0.18"
+azure_storage_blobs = "0.18"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index e559d00ded..548bde02f6 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -271,17 +271,12 @@ impl RemoteStorage for AzureBlobStorage {
 
         let mut builder = blob_client.get();
 
-        if let Some(end_exclusive) = end_exclusive {
-            builder = builder.range(Range::new(start_inclusive, end_exclusive));
+        let range: Range = if let Some(end_exclusive) = end_exclusive {
+            (start_inclusive..end_exclusive).into()
         } else {
-            // Open ranges are not supported by the SDK so we work around
-            // by setting the upper limit extremely high (but high enough
-            // to still be representable by signed 64 bit integers).
-            // TODO remove workaround once the SDK adds open range support
-            // https://github.com/Azure/azure-sdk-for-rust/issues/1438
-            let end_exclusive = u64::MAX / 4;
-            builder = builder.range(Range::new(start_inclusive, end_exclusive));
-        }
+            (start_inclusive..).into()
+        };
+        builder = builder.range(range);
 
         self.download_for_builder(builder).await
     }

From 8460654f61d59dc639db282473148e2b8ef50b09 Mon Sep 17 00:00:00 2001
From: Stas Kelvich <stas.kelvich@gmail.com>
Date: Wed, 13 Dec 2023 03:52:10 +0200
Subject: [PATCH 081/202] Add per-endpoint rate limiter to proxy

---
 proxy/src/auth.rs                 |  8 ++++
 proxy/src/bin/proxy.rs            |  4 ++
 proxy/src/config.rs               |  1 +
 proxy/src/proxy.rs                | 21 +++++++++
 proxy/src/rate_limiter.rs         |  1 +
 proxy/src/rate_limiter/limiter.rs | 71 +++++++++++++++++++++++++++++++
 proxy/src/serverless.rs           |  7 +++
 proxy/src/serverless/websocket.rs |  4 ++
 8 files changed, 117 insertions(+)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index 7d79d34045..eadb9abd43 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -62,6 +62,9 @@ pub enum AuthErrorImpl {
         Please add it to the allowed list in the Neon console."
     )]
     IpAddressNotAllowed,
+
+    #[error("Too many connections to this endpoint. Please try again later.")]
+    TooManyConnections,
 }
 
 #[derive(Debug, Error)]
@@ -80,6 +83,10 @@ impl AuthError {
     pub fn ip_address_not_allowed() -> Self {
         AuthErrorImpl::IpAddressNotAllowed.into()
     }
+
+    pub fn too_many_connections() -> Self {
+        AuthErrorImpl::TooManyConnections.into()
+    }
 }
 
 impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -102,6 +109,7 @@ impl UserFacingError for AuthError {
             MissingEndpointName => self.to_string(),
             Io(_) => "Internal error".to_string(),
             IpAddressNotAllowed => self.to_string(),
+            TooManyConnections => self.to_string(),
         }
     }
 }
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index fc1c44809a..1fa2d5599f 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -112,6 +112,9 @@ struct ProxyCliArgs {
     /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error.
     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     rate_limiter_timeout: tokio::time::Duration,
+    /// Endpoint rate limiter max number of requests per second.
+    #[clap(long, default_value_t = 300)]
+    endpoint_rps_limit: u32,
     /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
     #[clap(long, default_value_t = 100)]
     initial_limit: usize,
@@ -317,6 +320,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         authentication_config,
         require_client_ip: args.require_client_ip,
         disable_ip_check_for_http: args.disable_ip_check_for_http,
+        endpoint_rps_limit: args.endpoint_rps_limit,
     }));
 
     Ok(config)
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 182d71f9be..dea446eb22 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -20,6 +20,7 @@ pub struct ProxyConfig {
     pub authentication_config: AuthenticationConfig,
     pub require_client_ip: bool,
     pub disable_ip_check_for_http: bool,
+    pub endpoint_rps_limit: u32,
 }
 
 #[derive(Debug)]
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 152c894ca9..ae8b294841 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -9,6 +9,7 @@ use crate::{
     console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
     http::StatusCode,
     protocol2::WithClientIp,
+    rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     usage_metrics::{Ids, USAGE_METRICS},
 };
@@ -307,6 +308,7 @@ pub async fn task_main(
 
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
     let cancel_map = Arc::new(CancelMap::default());
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit));
 
     while let Some(accept_result) =
         run_until_cancelled(listener.accept(), &cancellation_token).await
@@ -315,6 +317,8 @@ pub async fn task_main(
 
         let session_id = uuid::Uuid::new_v4();
         let cancel_map = Arc::clone(&cancel_map);
+        let endpoint_rate_limiter = endpoint_rate_limiter.clone();
+
         connections.spawn(
             async move {
                 info!("accepted postgres client connection");
@@ -340,6 +344,7 @@ pub async fn task_main(
                     socket,
                     ClientMode::Tcp,
                     peer_addr.ip(),
+                    endpoint_rate_limiter,
                 )
                 .await
             }
@@ -415,6 +420,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     stream: S,
     mode: ClientMode,
     peer_addr: IpAddr,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     info!(
         protocol = mode.protocol_label(),
@@ -463,6 +469,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         &params,
         session_id,
         mode.allow_self_signed_compute(config),
+        endpoint_rate_limiter,
     );
     cancel_map
         .with_session(|session| client.connect_to_db(session, mode, &config.authentication_config))
@@ -928,6 +935,8 @@ struct Client<'a, S> {
     session_id: uuid::Uuid,
     /// Allow self-signed certificates (for testing).
     allow_self_signed_compute: bool,
+    /// Rate limiter for endpoints
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 }
 
 impl<'a, S> Client<'a, S> {
@@ -938,6 +947,7 @@ impl<'a, S> Client<'a, S> {
         params: &'a StartupMessageParams,
         session_id: uuid::Uuid,
         allow_self_signed_compute: bool,
+        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     ) -> Self {
         Self {
             stream,
@@ -945,6 +955,7 @@ impl<'a, S> Client<'a, S> {
             params,
             session_id,
             allow_self_signed_compute,
+            endpoint_rate_limiter,
         }
     }
 }
@@ -966,8 +977,18 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             params,
             session_id,
             allow_self_signed_compute,
+            endpoint_rate_limiter,
         } = self;
 
+        // check rate limit
+        if let Some(ep) = creds.get_endpoint() {
+            if !endpoint_rate_limiter.check(ep) {
+                return stream
+                    .throw_error(auth::AuthError::too_many_connections())
+                    .await;
+            }
+        }
+
         let proto = mode.protocol_label();
         let extra = console::ConsoleReqExtra {
             session_id, // aka this connection's id
diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs
index 5622c44a68..f40b8dbd1c 100644
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -3,4 +3,5 @@ mod limit_algorithm;
 mod limiter;
 pub use aimd::Aimd;
 pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
+pub use limiter::EndpointRateLimiter;
 pub use limiter::Limiter;
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 3a9fed3919..9d28bb67b3 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -6,6 +6,9 @@ use std::{
     time::Duration,
 };
 
+use dashmap::DashMap;
+use parking_lot::Mutex;
+use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
 use tokio::time::{timeout, Instant};
 use tracing::info;
@@ -15,6 +18,74 @@ use super::{
     RateLimiterConfig,
 };
 
+// Simple per-endpoint rate limiter.
+//
+// Check that number of connections to the endpoint is below `max_rps` rps.
+// Purposefully ignore user name and database name as clients can reconnect
+// with different names, so we'll end up sending some http requests to
+// the control plane.
+//
+// We also may save quite a lot of CPU (I think) by bailing out right after we
+// saw SNI, before doing TLS handshake. User-side error messages in that case
+// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
+// I went with a more expensive way that yields user-friendlier error messages.
+//
+// TODO: add a better bucketing here, e.g. not more than 300 requests per second,
+//       and not more than 1000 requests per 10 seconds, etc. Short bursts of reconnects
+//       are noramal during redeployments, so we should not block them.
+pub struct EndpointRateLimiter {
+    map: DashMap<SmolStr, Arc<Mutex<(chrono::NaiveTime, u32)>>>,
+    max_rps: u32,
+    access_count: AtomicUsize,
+}
+
+impl EndpointRateLimiter {
+    pub fn new(max_rps: u32) -> Self {
+        Self {
+            map: DashMap::new(),
+            max_rps,
+            access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request
+        }
+    }
+
+    /// Check that number of connections to the endpoint is below `max_rps` rps.
+    pub fn check(&self, endpoint: SmolStr) -> bool {
+        // do GC every 100k requests (worst case memory usage is about 10MB)
+        if self.access_count.fetch_add(1, Ordering::AcqRel) % 100_000 == 0 {
+            self.do_gc();
+        }
+
+        let now = chrono::Utc::now().naive_utc().time();
+        let entry = self
+            .map
+            .entry(endpoint)
+            .or_insert_with(|| Arc::new(Mutex::new((now, 0))));
+        let mut entry = entry.lock();
+        let (last_time, count) = *entry;
+
+        if now - last_time < chrono::Duration::seconds(1) {
+            if count >= self.max_rps {
+                return false;
+            }
+            *entry = (last_time, count + 1);
+        } else {
+            *entry = (now, 1);
+        }
+        true
+    }
+
+    /// Clean the map. Simple strategy: remove all entries. At worst, we'll
+    /// double the effective max_rps during the cleanup. But that way deletion
+    /// does not aquire mutex on each entry access.
+    pub fn do_gc(&self) {
+        info!(
+            "cleaning up endpoint rate limiter, current size = {}",
+            self.map.len()
+        );
+        self.map.clear();
+    }
+}
+
 /// Limits the number of concurrent jobs.
 ///
 /// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index cd496ff01e..92d6e2d851 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -14,6 +14,7 @@ use tokio_util::task::TaskTracker;
 
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
+use crate::rate_limiter::EndpointRateLimiter;
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
 use hyper::{
@@ -43,6 +44,7 @@ pub async fn task_main(
     }
 
     let conn_pool = conn_pool::GlobalConnPool::new(config);
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit));
 
     // shutdown the connection pool
     tokio::spawn({
@@ -91,6 +93,7 @@ pub async fn task_main(
             let sni_name = tls.server_name().map(|s| s.to_string());
             let conn_pool = conn_pool.clone();
             let ws_connections = ws_connections.clone();
+            let endpoint_rate_limiter = endpoint_rate_limiter.clone();
 
             async move {
                 let peer_addr = match client_addr {
@@ -103,6 +106,7 @@ pub async fn task_main(
                         let sni_name = sni_name.clone();
                         let conn_pool = conn_pool.clone();
                         let ws_connections = ws_connections.clone();
+                        let endpoint_rate_limiter = endpoint_rate_limiter.clone();
 
                         async move {
                             let cancel_map = Arc::new(CancelMap::default());
@@ -117,6 +121,7 @@ pub async fn task_main(
                                 session_id,
                                 sni_name,
                                 peer_addr.ip(),
+                                endpoint_rate_limiter,
                             )
                             .instrument(info_span!(
                                 "serverless",
@@ -190,6 +195,7 @@ async fn request_handler(
     session_id: uuid::Uuid,
     sni_hostname: Option<String>,
     peer_addr: IpAddr,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Response<Body>, ApiError> {
     let host = request
         .headers()
@@ -214,6 +220,7 @@ async fn request_handler(
                     session_id,
                     host,
                     peer_addr,
+                    endpoint_rate_limiter,
                 )
                 .await
                 {
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 199b03550d..cd6184cdee 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -3,6 +3,7 @@ use crate::{
     config::ProxyConfig,
     error::io_error,
     proxy::{handle_client, ClientMode},
+    rate_limiter::EndpointRateLimiter,
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream};
@@ -13,6 +14,7 @@ use pin_project_lite::pin_project;
 use std::{
     net::IpAddr,
     pin::Pin,
+    sync::Arc,
     task::{ready, Context, Poll},
 };
 use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
@@ -134,6 +136,7 @@ pub async fn serve_websocket(
     session_id: uuid::Uuid,
     hostname: Option<String>,
     peer_addr: IpAddr,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
     handle_client(
@@ -143,6 +146,7 @@ pub async fn serve_websocket(
         WebSocketRw::new(websocket),
         ClientMode::Websockets { hostname },
         peer_addr,
+        endpoint_rate_limiter,
     )
     .await?;
     Ok(())

From c8316b7a3fd713b04c8a764df5a859befab99b95 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 13 Dec 2023 13:53:23 +0000
Subject: [PATCH 082/202] simplify endpoint limiter (#6122)

## Problem

1. Using chrono for durations only is wasteful
2. The arc/mutex was not being utilised
3. Locking every shard in the dashmap every GC could cause latency
spikes
4. Only a single rate bucket (one limit over one interval) was supported

## Summary of changes

1. Use `Instant` instead of `NaiveTime`.
2. Remove the `Arc<Mutex<_>>` wrapper, relying on the fact that a dashmap
entry already gives mutable access
3. Clear only a random shard, update gc interval accordingly
4. Multiple buckets can be checked before allowing access

When I benchmarked the check function, it took on average 811ns when
multithreaded over the course of 10 million checks.
---
 proxy/src/proxy.rs                |   7 +-
 proxy/src/rate_limiter.rs         |   2 +-
 proxy/src/rate_limiter/limiter.rs | 118 +++++++++++++++++++++---------
 proxy/src/serverless.rs           |   8 +-
 4 files changed, 94 insertions(+), 41 deletions(-)

diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index ae8b294841..72ebc5d3be 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -9,7 +9,7 @@ use crate::{
     console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
     http::StatusCode,
     protocol2::WithClientIp,
-    rate_limiter::EndpointRateLimiter,
+    rate_limiter::{EndpointRateLimiter, RateBucketInfo},
     stream::{PqStream, Stream},
     usage_metrics::{Ids, USAGE_METRICS},
 };
@@ -308,7 +308,10 @@ pub async fn task_main(
 
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
     let cancel_map = Arc::new(CancelMap::default());
-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit));
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new([RateBucketInfo::new(
+        config.endpoint_rps_limit,
+        time::Duration::from_secs(1),
+    )]));
 
     while let Some(accept_result) =
         run_until_cancelled(listener.accept(), &cancellation_token).await
diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs
index f40b8dbd1c..b26386d159 100644
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -3,5 +3,5 @@ mod limit_algorithm;
 mod limiter;
 pub use aimd::Aimd;
 pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
-pub use limiter::EndpointRateLimiter;
 pub use limiter::Limiter;
+pub use limiter::{EndpointRateLimiter, RateBucketInfo};
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 9d28bb67b3..e493082796 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -1,16 +1,13 @@
-use std::{
-    sync::{
-        atomic::{AtomicUsize, Ordering},
-        Arc,
-    },
-    time::Duration,
+use std::sync::{
+    atomic::{AtomicUsize, Ordering},
+    Arc,
 };
 
 use dashmap::DashMap;
-use parking_lot::Mutex;
+use rand::{thread_rng, Rng};
 use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
-use tokio::time::{timeout, Instant};
+use tokio::time::{timeout, Duration, Instant};
 use tracing::info;
 
 use super::{
@@ -32,57 +29,106 @@ use super::{
 //
 // TODO: add a better bucketing here, e.g. not more than 300 requests per second,
 //       and not more than 1000 requests per 10 seconds, etc. Short bursts of reconnects
-//       are noramal during redeployments, so we should not block them.
+//       are normal during redeployments, so we should not block them.
 pub struct EndpointRateLimiter {
-    map: DashMap<SmolStr, Arc<Mutex<(chrono::NaiveTime, u32)>>>,
-    max_rps: u32,
+    map: DashMap<SmolStr, Vec<RateBucket>>,
+    info: Vec<RateBucketInfo>,
     access_count: AtomicUsize,
 }
 
-impl EndpointRateLimiter {
-    pub fn new(max_rps: u32) -> Self {
+#[derive(Clone, Copy)]
+struct RateBucket {
+    start: Instant,
+    count: u32,
+}
+
+impl RateBucket {
+    fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool {
+        if now - self.start < info.interval {
+            self.count < info.max_rpi
+        } else {
+            // bucket expired, reset
+            self.count = 0;
+            self.start = now;
+
+            true
+        }
+    }
+
+    fn inc(&mut self) {
+        self.count += 1;
+    }
+}
+
+pub struct RateBucketInfo {
+    interval: Duration,
+    // requests per interval
+    max_rpi: u32,
+}
+
+impl RateBucketInfo {
+    pub fn new(max_rps: u32, interval: Duration) -> Self {
         Self {
-            map: DashMap::new(),
-            max_rps,
+            interval,
+            max_rpi: max_rps * 1000 / interval.as_millis() as u32,
+        }
+    }
+}
+
+impl EndpointRateLimiter {
+    pub fn new(info: impl IntoIterator<Item = RateBucketInfo>) -> Self {
+        Self {
+            info: info.into_iter().collect(),
+            map: DashMap::with_shard_amount(64),
             access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request
         }
     }
 
     /// Check that number of connections to the endpoint is below `max_rps` rps.
     pub fn check(&self, endpoint: SmolStr) -> bool {
-        // do GC every 100k requests (worst case memory usage is about 10MB)
-        if self.access_count.fetch_add(1, Ordering::AcqRel) % 100_000 == 0 {
+        // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
+        // worst case memory usage is about:
+        //    = 2 * 2048 * 64 * (48B + 72B)
+        //    = 30MB
+        if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
             self.do_gc();
         }
 
-        let now = chrono::Utc::now().naive_utc().time();
-        let entry = self
-            .map
-            .entry(endpoint)
-            .or_insert_with(|| Arc::new(Mutex::new((now, 0))));
-        let mut entry = entry.lock();
-        let (last_time, count) = *entry;
+        let now = Instant::now();
+        let mut entry = self.map.entry(endpoint).or_insert_with(|| {
+            vec![
+                RateBucket {
+                    start: now,
+                    count: 0,
+                };
+                self.info.len()
+            ]
+        });
 
-        if now - last_time < chrono::Duration::seconds(1) {
-            if count >= self.max_rps {
-                return false;
-            }
-            *entry = (last_time, count + 1);
-        } else {
-            *entry = (now, 1);
+        let should_allow_request = entry
+            .iter_mut()
+            .zip(&self.info)
+            .all(|(bucket, info)| bucket.should_allow_request(info, now));
+
+        if should_allow_request {
+            // only increment the bucket counts if the request will actually be accepted
+            entry.iter_mut().for_each(RateBucket::inc);
         }
-        true
+
+        should_allow_request
     }
 
-    /// Clean the map. Simple strategy: remove all entries. At worst, we'll
-    /// double the effective max_rps during the cleanup. But that way deletion
-    /// does not aquire mutex on each entry access.
+    /// Clean the map. Simple strategy: remove all entries in a random shard.
+    /// At worst, we'll double the effective max_rps during the cleanup.
+    /// But that way deletion does not aquire mutex on each entry access.
     pub fn do_gc(&self) {
         info!(
             "cleaning up endpoint rate limiter, current size = {}",
             self.map.len()
         );
-        self.map.clear();
+        let n = self.map.shards().len();
+        let shard = thread_rng().gen_range(0..n);
+        self.map.shards()[shard].write().clear();
     }
 }
 
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 92d6e2d851..daac396ed6 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,11 +10,12 @@ use anyhow::bail;
 use hyper::StatusCode;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio::time;
 use tokio_util::task::TaskTracker;
 
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
-use crate::rate_limiter::EndpointRateLimiter;
+use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo};
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
 use hyper::{
@@ -44,7 +45,10 @@ pub async fn task_main(
     }
 
     let conn_pool = conn_pool::GlobalConnPool::new(config);
-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit));
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new([RateBucketInfo::new(
+        config.endpoint_rps_limit,
+        time::Duration::from_secs(1),
+    )]));
 
     // shutdown the connection pool
     tokio::spawn({

From e3778381a8eaac0b43238c1fc5ade20d459c20aa Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 13 Dec 2023 14:14:38 +0000
Subject: [PATCH 083/202] tests: make test_bulk_insert recreate tenant in same
 generation (#6113)

## Problem

The test deletes a tenant and recreates it with the same ID. The
recreation bumps the generation number, which can lead to stale
generation warnings in the logs.

## Summary of changes

Handle this more gracefully by re-creating in the same generation that
the tenant was previously attached in.

We could also update the tenant delete path to have the attachment
service drop tenant state on delete, but I like having it there: it
makes debugging easier, and the only time it's a problem is when a test
is re-using a tenant ID after deletion.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 test_runner/fixtures/neon_fixtures.py       |  7 ++++---
 test_runner/performance/test_bulk_insert.py | 13 ++++++++++++-
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 4b23650960..7dfdd9274d 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1870,11 +1870,12 @@ class NeonPageserver(PgProtocol):
         tenant_id: TenantId,
         conf: Optional[Dict[str, Any]] = None,
         auth_token: Optional[str] = None,
+        generation: Optional[int] = None,
     ) -> TenantId:
+        if generation is None:
+            generation = self.maybe_get_generation(tenant_id)
         client = self.http_client(auth_token=auth_token)
-        return client.tenant_create(
-            tenant_id, conf, generation=self.maybe_get_generation(tenant_id)
-        )
+        return client.tenant_create(tenant_id, conf, generation=generation)
 
     def tenant_load(self, tenant_id: TenantId):
         client = self.http_client()
diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index a146e011cc..a2a1fa11e5 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -55,9 +55,20 @@ def measure_recovery_time(env: NeonCompare):
 
     # Delete the Tenant in the pageserver: this will drop local and remote layers, such that
     # when we "create" the Tenant again, we will replay the WAL from the beginning.
+    #
+    # This is a "weird" thing to do, and can confuse the attachment service as we're re-using
+    # the same tenant ID for a tenant that is logically different from the pageserver's point
+    # of view, but the same as far as the safekeeper/WAL is concerned.  To work around that,
+    # we will explicitly create the tenant in the same generation that it was previously
+    # attached in.
+    assert env.env.attachment_service is not None
+    attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant)
+    assert attach_status is not None
+    (attach_gen, _) = attach_status
+
     client.tenant_delete(env.tenant)
     wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
-    env.env.pageserver.tenant_create(tenant_id=env.tenant)
+    env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)
 
     # Measure recovery time
     with env.record_duration("wal_recovery"):

From 2d226610612e3e125f8450c0ca65d33d10aa8774 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 13 Dec 2023 16:23:19 +0200
Subject: [PATCH 084/202] refactor: calculate_synthetic_size_worker, remove
 PRE::NeedsDownload (#6111)

Changes I wanted to make in #6106 but decided to leave out to keep that
commit clean for inclusion in #6090. Finally remove
`PageReconstructError::NeedsDownload`.
---
 pageserver/src/consumption_metrics.rs | 61 ++++++++++++++++-----------
 pageserver/src/http/routes.rs         |  5 ---
 pageserver/src/tenant/timeline.rs     | 21 +--------
 3 files changed, 37 insertions(+), 50 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 8f2b88d191..bde2cedca7 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,7 +3,7 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -256,8 +256,6 @@ async fn calculate_synthetic_size_worker(
         info!("calculate_synthetic_size_worker stopped");
     };
 
-    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
-
     loop {
         let started_at = Instant::now();
 
@@ -280,29 +278,14 @@ async fn calculate_synthetic_size_worker(
                 continue;
             }
 
-            if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) {
-                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
-                // We can put in some prioritization for consumption metrics.
-                // Same for the loop that fetches computed metrics.
-                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-                // which turns out is really handy to understand the system.
-                if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
-                    // this error can be returned if timeline is shutting down, but it does not
-                    // mean the synthetic size worker should terminate. we do not need any checks
-                    // in this function because `mgr::get_tenant` will error out after shutdown has
-                    // progressed to shutting down tenants.
-                    let is_cancelled = matches!(
-                        e.downcast_ref::<PageReconstructError>(),
-                        Some(PageReconstructError::Cancelled)
-                    );
+            let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
+                continue;
+            };
 
-                    if !is_cancelled {
-                        error!(
-                            "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"
-                        );
-                    }
-                }
-            }
+            // there is never any reason to exit calculate_synthetic_size_worker following any
+            // return value -- we don't need to care about shutdown because no tenant is found when
+            // pageserver is shut down.
+            calculate_and_log(&tenant, cancel, ctx).await;
         }
 
         crate::tenant::tasks::warn_when_period_overrun(
@@ -321,3 +304,31 @@ async fn calculate_synthetic_size_worker(
         }
     }
 }
+
+async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
+    const CAUSE: LogicalSizeCalculationCause =
+        LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
+
+    // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+    // We can put in some prioritization for consumption metrics.
+    // Same for the loop that fetches computed metrics.
+    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
+    // which turns out is really handy to understand the system.
+    let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
+        return;
+    };
+
+    // this error can be returned if timeline is shutting down, but it does not
+    // mean the synthetic size worker should terminate. we do not need any checks
+    // in this function because `mgr::get_tenant` will error out after shutdown has
+    // progressed to shutting down tenants.
+    let shutting_down = matches!(
+        e.downcast_ref::<PageReconstructError>(),
+        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+    );
+
+    if !shutting_down {
+        let tenant_shard_id = tenant.tenant_shard_id();
+        error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+    }
+}
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index fee50460a5..9faacaef89 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -136,11 +136,6 @@ impl From<PageReconstructError> for ApiError {
     fn from(pre: PageReconstructError) -> ApiError {
         match pre {
             PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
-            PageReconstructError::NeedsDownload(_, _) => {
-                // This shouldn't happen, because we use a RequestContext that requests to
-                // download any missing layer files on-demand.
-                ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
-            }
             PageReconstructError::Cancelled => {
                 ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
             }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 81dbc04793..a2a31f395e 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -29,7 +29,7 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{id::TenantTimelineId, sync::gate::Gate};
+use utils::sync::gate::Gate;
 
 use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
@@ -377,9 +377,6 @@ pub enum PageReconstructError {
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 
-    /// The operation would require downloading a layer that is missing locally.
-    NeedsDownload(TenantTimelineId, LayerFileName),
-
     /// The operation was cancelled
     Cancelled,
 
@@ -408,14 +405,6 @@ impl std::fmt::Debug for PageReconstructError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
         match self {
             Self::Other(err) => err.fmt(f),
-            Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
-                write!(
-                    f,
-                    "layer {}/{} needs download",
-                    tenant_timeline_id,
-                    layer_file_name.file_name()
-                )
-            }
             Self::Cancelled => write!(f, "cancelled"),
             Self::AncestorStopping(timeline_id) => {
                 write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -429,14 +418,6 @@ impl std::fmt::Display for PageReconstructError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
         match self {
             Self::Other(err) => err.fmt(f),
-            Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
-                write!(
-                    f,
-                    "layer {}/{} needs download",
-                    tenant_timeline_id,
-                    layer_file_name.file_name()
-                )
-            }
             Self::Cancelled => write!(f, "cancelled"),
             Self::AncestorStopping(timeline_id) => {
                 write!(f, "ancestor timeline {timeline_id} is being stopped")

From a919b863d172308846a9dae10d3c8fee25d322be Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 13 Dec 2023 18:05:33 +0200
Subject: [PATCH 085/202] refactor: remove eviction batching (#6060)

We have not had `layer_removal_cs` since #5108, so we no longer need
batching.
---
 pageserver/src/disk_usage_eviction_task.rs    | 174 ++++++++----------
 pageserver/src/http/routes.rs                 |  42 +----
 pageserver/src/tenant/timeline.rs             | 127 ++-----------
 .../src/tenant/timeline/eviction_task.rs      |  85 +++++----
 4 files changed, 141 insertions(+), 287 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index f01cd1cf8c..76906cfaf7 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -42,7 +42,6 @@
 //   reading these fields. We use the Debug impl for semi-structured logging, though.
 
 use std::{
-    collections::HashMap,
     sync::Arc,
     time::{Duration, SystemTime},
 };
@@ -125,7 +124,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
     state: &State,
     task_config: &DiskUsageEvictionTaskConfig,
-    _storage: &GenericRemoteStorage,
+    storage: &GenericRemoteStorage,
     tenants_dir: &Utf8Path,
     cancel: CancellationToken,
 ) {
@@ -149,8 +148,14 @@ async fn disk_usage_eviction_task(
         let start = Instant::now();
 
         async {
-            let res =
-                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
+            let res = disk_usage_eviction_task_iteration(
+                state,
+                task_config,
+                storage,
+                tenants_dir,
+                &cancel,
+            )
+            .await;
 
             match res {
                 Ok(()) => {}
@@ -181,12 +186,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
     state: &State,
     task_config: &DiskUsageEvictionTaskConfig,
+    storage: &GenericRemoteStorage,
     tenants_dir: &Utf8Path,
     cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
     let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
         .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
     match res {
         Ok(outcome) => {
             debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -268,8 +274,9 @@ struct LayerCount {
     count: usize,
 }
 
-pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     state: &State,
+    _storage: &GenericRemoteStorage,
     usage_pre: U,
     cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -321,16 +328,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     // Walk through the list of candidates, until we have accumulated enough layers to get
     // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
     // how much disk space would be used after evicting all the layers up to the current
-    // point in the list. The layers are collected in 'batched', grouped per timeline.
+    // point in the list.
     //
     // If we get far enough in the list that we start to evict layers that are below
     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
     // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
     let mut warned = None;
     let mut usage_planned = usage_pre;
-    let mut max_batch_size = 0;
-    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
+    let mut evicted_amount = 0;
+
+    for (i, (partition, candidate)) in candidates.iter().enumerate() {
         if !usage_planned.has_pressure() {
             debug!(
                 no_candidates_evicted = i,
@@ -339,25 +346,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
             break;
         }
 
-        if partition == MinResidentSizePartition::Below && warned.is_none() {
+        if partition == &MinResidentSizePartition::Below && warned.is_none() {
             warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
             warned = Some(usage_planned);
         }
 
         usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
-
-        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
-        // tasks to evict all seen layers until we have evicted enough
-
-        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
-
-        // semaphore will later be used to limit eviction concurrency, and we can express at
-        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
-        // but fail gracefully by not making batches larger.
-        if batch.len() < u32::MAX as usize {
-            batch.push(candidate.layer);
-            max_batch_size = max_batch_size.max(batch.len());
-        }
+        evicted_amount += 1;
     }
 
     let usage_planned = match warned {
@@ -372,100 +367,79 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     };
     debug!(?usage_planned, "usage planned");
 
-    // phase2: evict victims batched by timeline
+    // phase2: evict layers
 
     let mut js = tokio::task::JoinSet::new();
+    let limit = 1000;
 
-    // ratelimit to 1k files or any higher max batch size
-    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+    let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
+    let mut consumed_all = false;
 
-    for (timeline, batch) in batched {
-        let tenant_shard_id = timeline.tenant_shard_id;
-        let timeline_id = timeline.timeline_id;
-        let batch_size =
-            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+    // After the evictions, `usage_assumed` is the post-eviction usage,
+    // according to internal accounting.
+    let mut usage_assumed = usage_pre;
+    let mut evictions_failed = LayerCount::default();
 
-        // I dislike naming of `available_permits` but it means current total amount of permits
-        // because permits can be added
-        assert!(batch_size as usize <= limit.available_permits());
+    let evict_layers = async move {
+        loop {
+            let next = if js.len() >= limit || consumed_all {
+                js.join_next().await
+            } else if !js.is_empty() {
+                // opportunistically consume ready result, one per each new evicted
+                futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
+            } else {
+                None
+            };
 
-        debug!(%timeline_id, "evicting batch for timeline");
-
-        let evict = {
-            let limit = limit.clone();
-            let cancel = cancel.clone();
-            async move {
-                let mut evicted_bytes = 0;
-                let mut evictions_failed = LayerCount::default();
-
-                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
-                    // semaphore closing means cancelled
-                    return (evicted_bytes, evictions_failed);
-                };
-
-                let results = timeline.evict_layers(&batch).await;
-
-                match results {
-                    Ok(results) => {
-                        assert_eq!(results.len(), batch.len());
-                        for (result, layer) in results.into_iter().zip(batch.iter()) {
-                            let file_size = layer.layer_desc().file_size;
-                            match result {
-                                Some(Ok(())) => {
-                                    evicted_bytes += file_size;
-                                }
-                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                                    evictions_failed.file_sizes += file_size;
-                                    evictions_failed.count += 1;
-                                }
-                                None => {
-                                    assert!(cancel.is_cancelled());
-                                }
-                            }
-                        }
+            if let Some(next) = next {
+                match next {
+                    Ok(Ok(file_size)) => {
+                        usage_assumed.add_available_bytes(file_size);
                     }
-                    Err(e) => {
-                        warn!("failed to evict batch: {:#}", e);
+                    Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
+                        evictions_failed.file_sizes += file_size;
+                        evictions_failed.count += 1;
                     }
+                    Err(je) if je.is_cancelled() => unreachable!("not used"),
+                    Err(je) if je.is_panic() => { /* already logged */ }
+                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
                 }
-                (evicted_bytes, evictions_failed)
             }
-        }
-        .instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));
 
-        js.spawn(evict);
-
-        // spwaning multiple thousands of these is essentially blocking, so give already spawned a
-        // chance of making progress
-        tokio::task::yield_now().await;
-    }
-
-    let join_all = async move {
-        // After the evictions, `usage_assumed` is the post-eviction usage,
-        // according to internal accounting.
-        let mut usage_assumed = usage_pre;
-        let mut evictions_failed = LayerCount::default();
-
-        while let Some(res) = js.join_next().await {
-            match res {
-                Ok((evicted_bytes, failed)) => {
-                    usage_assumed.add_available_bytes(evicted_bytes);
-                    evictions_failed.file_sizes += failed.file_sizes;
-                    evictions_failed.count += failed.count;
-                }
-                Err(je) if je.is_cancelled() => unreachable!("not used"),
-                Err(je) if je.is_panic() => { /* already logged */ }
-                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            if consumed_all && js.is_empty() {
+                break;
             }
+
+            // calling again when consumed_all is fine as evicted is fused.
+            let Some((_partition, candidate)) = evicted.next() else {
+                consumed_all = true;
+                continue;
+            };
+
+            js.spawn(async move {
+                let rtc = candidate.timeline.remote_client.as_ref().expect(
+                    "holding the witness, all timelines must have a remote timeline client",
+                );
+                let file_size = candidate.layer.layer_desc().file_size;
+                candidate
+                    .layer
+                    .evict_and_wait(rtc)
+                    .await
+                    .map(|()| file_size)
+                    .map_err(|e| (file_size, e))
+            });
+
+            tokio::task::yield_now().await;
         }
+
         (usage_assumed, evictions_failed)
     };
 
     let (usage_assumed, evictions_failed) = tokio::select! {
-        tuple = join_all => { tuple },
+        tuple = evict_layers => { tuple },
         _ = cancel.cancelled() => {
-            // close the semaphore to stop any pending acquires
-            limit.close();
+            // dropping joinset will abort all pending evict_and_waits and that is fine, our
+            // requests will still stand
             return Ok(IterationOutcome::Cancelled);
         }
     };
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 9faacaef89..da7e8218a2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1588,7 +1588,7 @@ async fn always_panic_handler(
 
 async fn disk_usage_eviction_run(
     mut r: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
     check_permission(&r, None)?;
 
@@ -1625,48 +1625,26 @@ async fn disk_usage_eviction_run(
         freed_bytes: 0,
     };
 
-    let (tx, rx) = tokio::sync::oneshot::channel();
-
     let state = get_state(&r);
 
-    if state.remote_storage.as_ref().is_none() {
+    let Some(storage) = state.remote_storage.as_ref() else {
         return Err(ApiError::InternalServerError(anyhow::anyhow!(
             "remote storage not configured, cannot run eviction iteration"
         )));
-    }
+    };
 
     let state = state.disk_usage_eviction_state.clone();
 
-    let cancel = CancellationToken::new();
-    let child_cancel = cancel.clone();
-    let _g = cancel.drop_guard();
+    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
+        &state, storage, usage, &cancel,
+    )
+    .await;
 
-    crate::task_mgr::spawn(
-        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::DiskUsageEviction,
-        None,
-        None,
-        "ondemand disk usage eviction",
-        false,
-        async move {
-            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-                &state,
-                usage,
-                &child_cancel,
-            )
-            .await;
+    info!(?res, "disk_usage_eviction_task_iteration_impl finished");
 
-            info!(?res, "disk_usage_eviction_task_iteration_impl finished");
+    let res = res.map_err(ApiError::InternalServerError)?;
 
-            let _ = tx.send(res);
-            Ok(())
-        }
-        .in_current_span(),
-    );
-
-    let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, response)
+    json_response(StatusCode::OK, res)
 }
 
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a2a31f395e..d0241eafd5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1099,8 +1099,9 @@ impl Timeline {
         Ok(Some(true))
     }
 
-    /// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
-    /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
+    /// Evict just one layer.
+    ///
+    /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
     pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
         let _gate = self
             .gate
@@ -1111,109 +1112,17 @@ impl Timeline {
             return Ok(None);
         };
 
-        let Some(local_layer) = local_layer.keep_resident().await? else {
-            return Ok(Some(false));
-        };
-
-        let local_layer: Layer = local_layer.into();
-
-        let remote_client = self
+        let rtc = self
             .remote_client
             .as_ref()
             .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
 
-        let results = self
-            .evict_layer_batch(remote_client, &[local_layer])
-            .await?;
-        assert_eq!(results.len(), 1);
-        let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
-        match result {
-            None => anyhow::bail!("task_mgr shutdown requested"),
-            Some(Ok(())) => Ok(Some(true)),
-            Some(Err(e)) => Err(anyhow::Error::new(e)),
+        match local_layer.evict_and_wait(rtc).await {
+            Ok(()) => Ok(Some(true)),
+            Err(EvictionError::NotFound) => Ok(Some(false)),
+            Err(EvictionError::Downloaded) => Ok(Some(false)),
         }
     }
-
-    /// Evict a batch of layers.
-    pub(crate) async fn evict_layers(
-        &self,
-        layers_to_evict: &[Layer],
-    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
-        let _gate = self
-            .gate
-            .enter()
-            .map_err(|_| anyhow::anyhow!("Shutting down"))?;
-
-        let remote_client = self
-            .remote_client
-            .as_ref()
-            .context("timeline must have RemoteTimelineClient")?;
-
-        self.evict_layer_batch(remote_client, layers_to_evict).await
-    }
-
-    /// Evict multiple layers at once, continuing through errors.
-    ///
-    /// The `remote_client` should be this timeline's `self.remote_client`.
-    /// We make the caller provide it so that they are responsible for handling the case
-    /// where someone wants to evict the layer but no remote storage is configured.
-    ///
-    /// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`.
-    /// If `Err()` is returned, no eviction was attempted.
-    /// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`.
-    /// Meaning of each `result[i]`:
-    /// - `Some(Err(...))` if layer replacement failed for some reason
-    ///    - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks)
-    /// - `Some(Ok(()))` if everything went well.
-    /// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`.
-    async fn evict_layer_batch(
-        &self,
-        remote_client: &Arc<RemoteTimelineClient>,
-        layers_to_evict: &[Layer],
-    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
-        {
-            // to avoid racing with detach and delete_timeline
-            let state = self.current_state();
-            anyhow::ensure!(
-                state == TimelineState::Active,
-                "timeline is not active but {state:?}"
-            );
-        }
-
-        let mut results = Vec::with_capacity(layers_to_evict.len());
-        for _ in 0..layers_to_evict.len() {
-            results.push(None);
-        }
-
-        let mut js = tokio::task::JoinSet::new();
-
-        for (i, l) in layers_to_evict.iter().enumerate() {
-            js.spawn({
-                let l = l.to_owned();
-                let remote_client = remote_client.clone();
-                async move { (i, l.evict_and_wait(&remote_client).await) }
-            });
-        }
-
-        let join = async {
-            while let Some(next) = js.join_next().await {
-                match next {
-                    Ok((i, res)) => results[i] = Some(res),
-                    Err(je) if je.is_cancelled() => unreachable!("not used"),
-                    Err(je) if je.is_panic() => { /* already logged */ }
-                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
-                }
-            }
-        };
-
-        tokio::select! {
-            _ = self.cancel.cancelled() => {},
-            _ = join => {}
-        }
-
-        assert_eq!(results.len(), layers_to_evict.len());
-        Ok(results)
-    }
 }
 
 /// Number of times we will compute partition within a checkpoint distance.
@@ -4586,7 +4495,7 @@ mod tests {
             .await
             .unwrap();
 
-        let rc = timeline
+        let rtc = timeline
             .remote_client
             .clone()
             .expect("just configured this");
@@ -4599,16 +4508,12 @@ mod tests {
             .expect("should had been resident")
             .drop_eviction_guard();
 
-        let batch = [layer];
-
-        let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
-        let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
+        let first = async { layer.evict_and_wait(&rtc).await };
+        let second = async { layer.evict_and_wait(&rtc).await };
 
         let (first, second) = tokio::join!(first, second);
 
-        let (first, second) = (only_one(first), only_one(second));
-
-        let res = batch[0].keep_resident().await;
+        let res = layer.keep_resident().await;
         assert!(matches!(res, Ok(None)), "{res:?}");
 
         match (first, second) {
@@ -4629,14 +4534,6 @@ mod tests {
         RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
     }
 
-    fn only_one<T>(mut input: Vec<Option<T>>) -> T {
-        assert_eq!(1, input.len());
-        input
-            .pop()
-            .expect("length just checked")
-            .expect("no cancellation")
-    }
-
     async fn find_some_layer(timeline: &Timeline) -> Layer {
         let layers = timeline.layers.read().await;
         let desc = layers
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 020c5a9e9f..782e8f9e39 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -212,11 +212,21 @@ impl Timeline {
         // Gather layers for eviction.
         // NB: all the checks can be invalidated as soon as we release the layer map lock.
         // We don't want to hold the layer map lock during eviction.
+
         // So, we just need to deal with this.
-        let candidates: Vec<_> = {
+
+        let remote_client = match self.remote_client.as_ref() {
+            Some(c) => c,
+            None => {
+                error!("no remote storage configured, cannot evict layers");
+                return ControlFlow::Continue(());
+            }
+        };
+
+        let mut js = tokio::task::JoinSet::new();
+        {
             let guard = self.layers.read().await;
             let layers = guard.layer_map();
-            let mut candidates = Vec::new();
             for hist_layer in layers.iter_historic_layers() {
                 let hist_layer = guard.get_from_desc(&hist_layer);
 
@@ -262,54 +272,49 @@ impl Timeline {
                         continue;
                     }
                 };
+                let layer = guard.drop_eviction_guard();
                 if no_activity_for > p.threshold {
-                    candidates.push(guard.drop_eviction_guard())
+                    let remote_client = remote_client.clone();
+                    // this could cause a lot of allocations in some cases
+                    js.spawn(async move { layer.evict_and_wait(&remote_client).await });
+                    stats.candidates += 1;
                 }
             }
-            candidates
-        };
-        stats.candidates = candidates.len();
-
-        let remote_client = match self.remote_client.as_ref() {
-            None => {
-                error!(
-                    num_candidates = candidates.len(),
-                    "no remote storage configured, cannot evict layers"
-                );
-                return ControlFlow::Continue(());
-            }
-            Some(c) => c,
         };
 
-        let results = match self.evict_layer_batch(remote_client, &candidates).await {
-            Err(pre_err) => {
-                stats.errors += candidates.len();
-                error!("could not do any evictions: {pre_err:#}");
-                return ControlFlow::Continue(());
+        let join_all = async move {
+            while let Some(next) = js.join_next().await {
+                match next {
+                    Ok(Ok(())) => stats.evicted += 1,
+                    Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                        stats.not_evictable += 1;
+                    }
+                    Err(je) if je.is_cancelled() => unreachable!("not used"),
+                    Err(je) if je.is_panic() => {
+                        /* already logged */
+                        stats.errors += 1;
+                    }
+                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+                }
             }
-            Ok(results) => results,
+            stats
         };
-        assert_eq!(results.len(), candidates.len());
-        for result in results {
-            match result {
-                None => {
-                    stats.skipped_for_shutdown += 1;
-                }
-                Some(Ok(())) => {
-                    stats.evicted += 1;
-                }
-                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                    stats.not_evictable += 1;
+
+        tokio::select! {
+            stats = join_all => {
+                if stats.candidates == stats.not_evictable {
+                    debug!(stats=?stats, "eviction iteration complete");
+                } else if stats.errors > 0 || stats.not_evictable > 0 {
+                    warn!(stats=?stats, "eviction iteration complete");
+                } else {
+                    info!(stats=?stats, "eviction iteration complete");
                 }
             }
+            _ = cancel.cancelled() => {
+                // just drop the joinset to "abort"
+            }
         }
-        if stats.candidates == stats.not_evictable {
-            debug!(stats=?stats, "eviction iteration complete");
-        } else if stats.errors > 0 || stats.not_evictable > 0 {
-            warn!(stats=?stats, "eviction iteration complete");
-        } else {
-            info!(stats=?stats, "eviction iteration complete");
-        }
+
         ControlFlow::Continue(())
     }
 

From c2528ae671d3a636d0a993ef35952e9d294923c1 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Wed, 13 Dec 2023 17:23:24 +0100
Subject: [PATCH 086/202] Increase pgbouncer pool size to 64 for VMs (#6124)

The pool size was changed for pods
(https://github.com/neondatabase/cloud/pull/8057). The idea is to increase
it for VMs too.
---
 vm-image-spec.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 6f0ebe5f66..804405293f 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -34,7 +34,7 @@ files:
       server_tls_sslmode=disable
       pool_mode=transaction
       max_client_conn=10000
-      default_pool_size=16
+      default_pool_size=64
       max_prepared_statements=0
   - filename: cgconfig.conf
     content: |

From d6fcc18eb2dae12b63fedf84190a4b2cea23f7ab Mon Sep 17 00:00:00 2001
From: George MacKerron <george@mackerron.co.uk>
Date: Wed, 13 Dec 2023 17:18:20 +0000
Subject: [PATCH 087/202] Add Neon-Batch- headers to OPTIONS response for
 SQL-over-HTTP requests (#6116)

This is needed to allow use of batch queries from browsers.

## Problem

SQL-over-HTTP batch queries fail from web browsers because the relevant
headers, `Neon-Batch-Isolation-Level` and `Neon-Batch-Read-Only`, are
not included in the server's OPTIONS response. I think we simply forgot
to add them when implementing the batch query feature.

## Summary of changes

Added `Neon-Batch-Isolation-Level` and `Neon-Batch-Read-Only` to the
OPTIONS response.
---
 proxy/src/serverless.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index daac396ed6..bfb4dc44eb 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -252,7 +252,7 @@ async fn request_handler(
             .header("Access-Control-Allow-Origin", "*")
             .header(
                 "Access-Control-Allow-Headers",
-                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
+                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level",
             )
             .header("Access-Control-Max-Age", "86400" /* 24 hours */)
             .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code

From 85720616b1567dbf4c444ed21ce7c23572c0ce42 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 8 Nov 2023 15:34:07 -0600
Subject: [PATCH 088/202] Update Postgres 14 series to 14.10

---
 vendor/postgres-v14   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index dd067cf656..0bb356aa0c 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit dd067cf656f6810a25aca6025633d32d02c5085a
+Subproject commit 0bb356aa0cd1582112926fbcf0b5370222c2db6d
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c4cea208ee..490ffbd50d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
     "postgres-v16": "e3a22b72922055f9212eca12700190f118578362",
     "postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a",
-    "postgres-v14": "dd067cf656f6810a25aca6025633d32d02c5085a"
+    "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d"
 }

From c94db6adbb74e3e8bf55269d7a2c93e6e6b874d4 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 8 Nov 2023 15:43:06 -0600
Subject: [PATCH 089/202] Update Postgres 15 series to 15.5

---
 vendor/postgres-v15   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index bc88f53931..24333abb81 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit bc88f539312fcc4bb292ce94ae9db09ab6656e8a
+Subproject commit 24333abb81a9ecae4541019478f0bf7d0b289df7
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 490ffbd50d..b2af616497 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
     "postgres-v16": "e3a22b72922055f9212eca12700190f118578362",
-    "postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a",
+    "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7",
     "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d"
 }

From 158adf602e81d1901f4abee13cc752b05331cb1c Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 8 Nov 2023 15:54:10 -0600
Subject: [PATCH 090/202] Update Postgres 16 series to 16.1

---
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index e3a22b7292..863b71572b 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit e3a22b72922055f9212eca12700190f118578362
+Subproject commit 863b71572bc441581efb3bbee2ad18af037be1bb
diff --git a/vendor/revisions.json b/vendor/revisions.json
index b2af616497..a9575a2cb7 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "e3a22b72922055f9212eca12700190f118578362",
+    "postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb",
     "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7",
     "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d"
 }

From f401a21cf656375bb04b33723fd75dc2fdd941d0 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Thu, 9 Nov 2023 20:18:07 +0000
Subject: [PATCH 091/202] Fix test_simple_sync_safekeepers

The expected binary greeting message encodes the Postgres 16 version number (now 160001).
---
 libs/walproposer/src/walproposer.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 0661d3a969..f5723018d7 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -436,9 +436,9 @@ mod tests {
                 event_mask: 0,
             }),
             expected_messages: vec![
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                 vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                     147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                     188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -478,7 +478,7 @@ mod tests {
         // walproposer will panic when it finishes sync_safekeepers
         std::panic::catch_unwind(|| wp.start()).unwrap_err();
         // validate the resulting LSN
-        assert_eq!(receiver.recv()?, 1337);
+        assert_eq!(receiver.try_recv(), Ok(1337));
         Ok(())
         // drop() will free up resources here
     }

From 904dff58b508df7e98261ca2c72e975736aef922 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 6 Dec 2023 23:00:09 +0000
Subject: [PATCH 092/202] test_wal_restore_http: cleanup test

---
 test_runner/regress/test_wal_restore.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 4a9ffeee4b..7d03f644d1 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -1,7 +1,6 @@
 import sys
 import tarfile
 import tempfile
-import time
 from pathlib import Path
 
 import pytest
@@ -12,6 +11,7 @@ from fixtures.neon_fixtures import (
     PgBin,
     VanillaPostgres,
 )
+from fixtures.pageserver.utils import timeline_delete_wait_completed
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import LocalFsStorage
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -128,10 +128,7 @@ def test_wal_restore_initdb(
         assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
 
 
-def test_wal_restore_http(
-    neon_env_builder: NeonEnvBuilder,
-    test_output_dir: Path,
-):
+def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     endpoint = env.endpoints.create_start("main")
     endpoint.safe_psql("create table t as select generate_series(1,300000)")
@@ -145,15 +142,7 @@ def test_wal_restore_http(
 
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
 
-    test_output_dir / "initdb.tar.zst"
-
-    (env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / "initdb.tar.zst")
-
-    ps_client.timeline_delete(tenant_id, timeline_id)
-    time.sleep(2)
-
-    # verify that it is indeed deleted
-    # TODO
+    timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
 
     # issue the restoration command
     ps_client.timeline_create(

From 0cd49cac84d8edfa670935a84bb1e271245c6623 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 6 Dec 2023 23:01:12 +0000
Subject: [PATCH 093/202] test_compatibility: make it use initdb.tar.zst

---
 test_runner/regress/test_compatibility.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 3f5de100fd..5a9c2782e6 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -273,9 +273,24 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
     timeline_id = env.initial_timeline
     pg_version = env.pg_version
 
-    shutil.rmtree(repo_dir / "local_fs_remote_storage")
+    # Delete all files from local_fs_remote_storage except initdb.tar.zst,
+    # the file is required for `timeline_create` with `existing_initdb_timeline_id`.
+    #
+    # TODO: switch to Path.walk() in Python 3.12
+    # for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk():
+    for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"):
+        for filename in filenames:
+            if filename != "initdb.tar.zst":
+                (Path(dirpath) / filename).unlink()
+
     timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
-    pageserver_http.timeline_create(pg_version, tenant_id, timeline_id)
+    pageserver_http.timeline_create(
+        pg_version=pg_version,
+        tenant_id=tenant_id,
+        new_timeline_id=timeline_id,
+        existing_initdb_timeline_id=timeline_id,
+    )
+
     pg_bin.run_capture(
         ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
     )

From 6987b5c44e128dc4690ae7290b770fa29b1ac65b Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 13 Dec 2023 21:43:49 +0000
Subject: [PATCH 094/202] proxy: add more rates to endpoint limiter (#6130)

## Problem

A single rate bucket is of limited usefulness.

## Summary of changes

Introduce a secondary bucket allowing an average of 200 requests per
second over 1 minute, and a tertiary bucket allowing an average of 100
requests per second over 10 minutes.

Configured by using a format like

```sh
proxy --endpoint-rps-limit 300@1s --endpoint-rps-limit 100@10s --endpoint-rps-limit 50@1m
```

If the bucket limits are inconsistent, an error is returned on startup:

```
$ proxy --endpoint-rps-limit 300@1s --endpoint-rps-limit 10@10s
Error: invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)
```
---
 proxy/src/bin/proxy.rs            |  47 ++++++++-
 proxy/src/config.rs               |   4 +-
 proxy/src/proxy.rs                |   7 +-
 proxy/src/rate_limiter/limiter.rs | 163 +++++++++++++++++++++++++++---
 proxy/src/serverless.rs           |   8 +-
 5 files changed, 199 insertions(+), 30 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 1fa2d5599f..ae4c42bcb1 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -7,6 +7,8 @@ use proxy::console;
 use proxy::console::provider::AllowedIpsCache;
 use proxy::console::provider::NodeInfoCache;
 use proxy::http;
+use proxy::rate_limiter::EndpointRateLimiter;
+use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
 use proxy::usage_metrics;
 
@@ -14,6 +16,7 @@ use anyhow::bail;
 use proxy::config::{self, ProxyConfig};
 use proxy::serverless;
 use std::pin::pin;
+use std::sync::Arc;
 use std::{borrow::Cow, net::SocketAddr};
 use tokio::net::TcpListener;
 use tokio::task::JoinSet;
@@ -113,8 +116,11 @@ struct ProxyCliArgs {
     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     rate_limiter_timeout: tokio::time::Duration,
     /// Endpoint rate limiter max number of requests per second.
-    #[clap(long, default_value_t = 300)]
-    endpoint_rps_limit: u32,
+    ///
+    /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
+    /// Can be given multiple times for different bucket sizes.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
+    endpoint_rps_limit: Vec<RateBucketInfo>,
     /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
     #[clap(long, default_value_t = 100)]
     initial_limit: usize,
@@ -157,6 +163,8 @@ async fn main() -> anyhow::Result<()> {
     let proxy_listener = TcpListener::bind(proxy_address).await?;
     let cancellation_token = CancellationToken::new();
 
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
+
     // client facing tasks. these will exit on error or on cancellation
     // cancellation returns Ok(())
     let mut client_tasks = JoinSet::new();
@@ -164,6 +172,7 @@ async fn main() -> anyhow::Result<()> {
         config,
         proxy_listener,
         cancellation_token.clone(),
+        endpoint_rate_limiter.clone(),
     ));
 
     // TODO: rename the argument to something like serverless.
@@ -177,6 +186,7 @@ async fn main() -> anyhow::Result<()> {
             config,
             serverless_listener,
             cancellation_token.clone(),
+            endpoint_rate_limiter.clone(),
         ));
     }
 
@@ -311,6 +321,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     let authentication_config = AuthenticationConfig {
         scram_protocol_timeout: args.scram_protocol_timeout,
     };
+
+    let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
+    RateBucketInfo::validate(&mut endpoint_rps_limit)?;
+
     let config = Box::leak(Box::new(ProxyConfig {
         tls_config,
         auth_backend,
@@ -320,8 +334,35 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         authentication_config,
         require_client_ip: args.require_client_ip,
         disable_ip_check_for_http: args.disable_ip_check_for_http,
-        endpoint_rps_limit: args.endpoint_rps_limit,
+        endpoint_rps_limit,
     }));
 
     Ok(config)
 }
+
+#[cfg(test)]
+mod tests {
+    use std::time::Duration;
+
+    use clap::Parser;
+    use proxy::rate_limiter::RateBucketInfo;
+
+    #[test]
+    fn parse_endpoint_rps_limit() {
+        let config = super::ProxyCliArgs::parse_from([
+            "proxy",
+            "--endpoint-rps-limit",
+            "100@1s",
+            "--endpoint-rps-limit",
+            "20@30s",
+        ]);
+
+        assert_eq!(
+            config.endpoint_rps_limit,
+            vec![
+                RateBucketInfo::new(100, Duration::from_secs(1)),
+                RateBucketInfo::new(20, Duration::from_secs(30)),
+            ]
+        );
+    }
+}
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index dea446eb22..f932df4058 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,4 +1,4 @@
-use crate::auth;
+use crate::{auth, rate_limiter::RateBucketInfo};
 use anyhow::{bail, ensure, Context, Ok};
 use rustls::{sign, Certificate, PrivateKey};
 use sha2::{Digest, Sha256};
@@ -20,7 +20,7 @@ pub struct ProxyConfig {
     pub authentication_config: AuthenticationConfig,
     pub require_client_ip: bool,
     pub disable_ip_check_for_http: bool,
-    pub endpoint_rps_limit: u32,
+    pub endpoint_rps_limit: Vec<RateBucketInfo>,
 }
 
 #[derive(Debug)]
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 72ebc5d3be..396db8f96a 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -9,7 +9,7 @@ use crate::{
     console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
     http::StatusCode,
     protocol2::WithClientIp,
-    rate_limiter::{EndpointRateLimiter, RateBucketInfo},
+    rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     usage_metrics::{Ids, USAGE_METRICS},
 };
@@ -297,6 +297,7 @@ pub async fn task_main(
     config: &'static ProxyConfig,
     listener: tokio::net::TcpListener,
     cancellation_token: CancellationToken,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("proxy has shut down");
@@ -308,10 +309,6 @@ pub async fn task_main(
 
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
     let cancel_map = Arc::new(CancelMap::default());
-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new([RateBucketInfo::new(
-        config.endpoint_rps_limit,
-        time::Duration::from_secs(1),
-    )]));
 
     while let Some(accept_result) =
         run_until_cancelled(listener.accept(), &cancellation_token).await
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index e493082796..87c1597ca9 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -3,7 +3,9 @@ use std::sync::{
     Arc,
 };
 
+use anyhow::bail;
 use dashmap::DashMap;
+use itertools::Itertools;
 use rand::{thread_rng, Rng};
 use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
@@ -26,13 +28,9 @@ use super::{
 // saw SNI, before doing TLS handshake. User-side error messages in that case
 // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
 // I went with a more expensive way that yields user-friendlier error messages.
-//
-// TODO: add a better bucketing here, e.g. not more than 300 requests per second,
-//       and not more than 1000 requests per 10 seconds, etc. Short bursts of reconnects
-//       are normal during redeployments, so we should not block them.
 pub struct EndpointRateLimiter {
     map: DashMap<SmolStr, Vec<RateBucket>>,
-    info: Vec<RateBucketInfo>,
+    info: &'static [RateBucketInfo],
     access_count: AtomicUsize,
 }
 
@@ -60,25 +58,76 @@ impl RateBucket {
     }
 }
 
+#[derive(Clone, Copy, PartialEq)]
 pub struct RateBucketInfo {
-    interval: Duration,
+    pub interval: Duration,
     // requests per interval
-    max_rpi: u32,
+    pub max_rpi: u32,
+}
+
+impl std::fmt::Display for RateBucketInfo {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32;
+        write!(f, "{rps}@{}", humantime::format_duration(self.interval))
+    }
+}
+
+impl std::fmt::Debug for RateBucketInfo {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{self}")
+    }
+}
+
+impl std::str::FromStr for RateBucketInfo {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let Some((max_rps, interval)) = s.split_once('@') else {
+            bail!("invalid rate info")
+        };
+        let max_rps = max_rps.parse()?;
+        let interval = humantime::parse_duration(interval)?;
+        Ok(Self::new(max_rps, interval))
+    }
 }
 
 impl RateBucketInfo {
-    pub fn new(max_rps: u32, interval: Duration) -> Self {
+    pub const DEFAULT_SET: [Self; 3] = [
+        Self::new(300, Duration::from_secs(1)),
+        Self::new(200, Duration::from_secs(60)),
+        Self::new(100, Duration::from_secs(600)),
+    ];
+
+    pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
+        info.sort_unstable_by_key(|info| info.interval);
+        let invalid = info
+            .iter()
+            .tuple_windows()
+            .find(|(a, b)| a.max_rpi > b.max_rpi);
+        if let Some((a, b)) = invalid {
+            bail!(
+                "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
+                b.max_rpi,
+                a.max_rpi,
+            );
+        }
+
+        Ok(())
+    }
+
+    pub const fn new(max_rps: u32, interval: Duration) -> Self {
         Self {
             interval,
-            max_rpi: max_rps * 1000 / interval.as_millis() as u32,
+            max_rpi: max_rps * interval.as_millis() as u32 / 1000,
         }
     }
 }
 
 impl EndpointRateLimiter {
-    pub fn new(info: impl IntoIterator<Item = RateBucketInfo>) -> Self {
+    pub fn new(info: &'static [RateBucketInfo]) -> Self {
+        info!(buckets = ?info, "endpoint rate limiter");
         Self {
-            info: info.into_iter().collect(),
+            info,
             map: DashMap::with_shard_amount(64),
             access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request
         }
@@ -107,7 +156,7 @@ impl EndpointRateLimiter {
 
         let should_allow_request = entry
             .iter_mut()
-            .zip(&self.info)
+            .zip(self.info)
             .all(|(bucket, info)| bucket.should_allow_request(info, now));
 
         if should_allow_request {
@@ -444,9 +493,11 @@ mod tests {
     use std::{pin::pin, task::Context, time::Duration};
 
     use futures::{task::noop_waker_ref, Future};
+    use smol_str::SmolStr;
+    use tokio::time;
 
-    use super::{Limiter, Outcome};
-    use crate::rate_limiter::RateLimitAlgorithm;
+    use super::{EndpointRateLimiter, Limiter, Outcome};
+    use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
 
     #[tokio::test]
     async fn it_works() {
@@ -555,4 +606,88 @@ mod tests {
         limiter.release(token1, None).await;
         limiter.release(token2, None).await;
     }
+
+    #[test]
+    fn rate_bucket_rpi() {
+        let rate_bucket = RateBucketInfo::new(50, Duration::from_secs(5));
+        assert_eq!(rate_bucket.max_rpi, 50 * 5);
+
+        let rate_bucket = RateBucketInfo::new(50, Duration::from_millis(500));
+        assert_eq!(rate_bucket.max_rpi, 50 / 2);
+    }
+
+    #[test]
+    fn rate_bucket_parse() {
+        let rate_bucket: RateBucketInfo = "100@10s".parse().unwrap();
+        assert_eq!(rate_bucket.interval, Duration::from_secs(10));
+        assert_eq!(rate_bucket.max_rpi, 100 * 10);
+        assert_eq!(rate_bucket.to_string(), "100@10s");
+
+        let rate_bucket: RateBucketInfo = "100@1m".parse().unwrap();
+        assert_eq!(rate_bucket.interval, Duration::from_secs(60));
+        assert_eq!(rate_bucket.max_rpi, 100 * 60);
+        assert_eq!(rate_bucket.to_string(), "100@1m");
+    }
+
+    #[test]
+    fn default_rate_buckets() {
+        let mut defaults = RateBucketInfo::DEFAULT_SET;
+        RateBucketInfo::validate(&mut defaults[..]).unwrap();
+    }
+
+    #[test]
+    #[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
+    fn rate_buckets_validate() {
+        let mut rates: Vec<RateBucketInfo> = ["300@1s", "10@10s"]
+            .into_iter()
+            .map(|s| s.parse().unwrap())
+            .collect();
+        RateBucketInfo::validate(&mut rates).unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_rate_limits() {
+        let mut rates: Vec<RateBucketInfo> = ["100@1s", "20@30s"]
+            .into_iter()
+            .map(|s| s.parse().unwrap())
+            .collect();
+        RateBucketInfo::validate(&mut rates).unwrap();
+        let limiter = EndpointRateLimiter::new(Vec::leak(rates));
+
+        let endpoint = SmolStr::from("ep-my-endpoint-1234");
+
+        time::pause();
+
+        for _ in 0..100 {
+            assert!(limiter.check(endpoint.clone()));
+        }
+        // more connections fail
+        assert!(!limiter.check(endpoint.clone()));
+
+        // fail even after 500ms as it's in the same bucket
+        time::advance(time::Duration::from_millis(500)).await;
+        assert!(!limiter.check(endpoint.clone()));
+
+        // after a full 1s, 100 requests are allowed again
+        time::advance(time::Duration::from_millis(500)).await;
+        for _ in 1..6 {
+            for _ in 0..100 {
+                assert!(limiter.check(endpoint.clone()));
+            }
+            time::advance(time::Duration::from_millis(1000)).await;
+        }
+
+        // more connections after 600 will exceed the 20rps@30s limit
+        assert!(!limiter.check(endpoint.clone()));
+
+        // will still fail before the 30 second limit
+        time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
+        assert!(!limiter.check(endpoint.clone()));
+
+        // after the full 30 seconds, 100 requests are allowed again
+        time::advance(time::Duration::from_millis(1)).await;
+        for _ in 0..100 {
+            assert!(limiter.check(endpoint.clone()));
+        }
+    }
 }
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index bfb4dc44eb..cdff42b529 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,12 +10,11 @@ use anyhow::bail;
 use hyper::StatusCode;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
-use tokio::time;
 use tokio_util::task::TaskTracker;
 
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
-use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo};
+use crate::rate_limiter::EndpointRateLimiter;
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
 use hyper::{
@@ -39,16 +38,13 @@ pub async fn task_main(
     config: &'static ProxyConfig,
     ws_listener: TcpListener,
     cancellation_token: CancellationToken,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("websocket server has shut down");
     }
 
     let conn_pool = conn_pool::GlobalConnPool::new(config);
-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new([RateBucketInfo::new(
-        config.endpoint_rps_limit,
-        time::Duration::from_secs(1),
-    )]));
 
     // shutdown the connection pool
     tokio::spawn({

From c4e0ef507f6979aad673c22afeca2ad0049f2545 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 14 Dec 2023 13:09:24 +0000
Subject: [PATCH 095/202] pageserver: heatmap uploads (#6050)

Dependency (commits inline):
https://github.com/neondatabase/neon/pull/5842

## Problem

Secondary mode tenants need a manifest of what to download. Ultimately
this will be some kind of heat-scored set of layers, but as a robust
first step we will simply use the set of resident layers: secondary
tenant locations will aim to match the on-disk content of the attached
location.

## Summary of changes

- Add heatmap types representing the remote structure
- Add hooks to Tenant/Timeline for generating these heatmaps
- Create a new `HeatmapUploader` type that is external to `Tenant`, and
responsible for walking the list of attached tenants and scheduling
heatmap uploads.

Notes to reviewers:
- Putting the logic for uploads (and later, secondary mode downloads)
outside of `Tenant` is an opinionated choice, motivated by:
- Enable future smarter scheduling of operations, e.g. uploading the
stalest tenant first, rather than having all tenants compete for a fair
semaphore on a first-come-first-served basis. Similarly for downloads,
we may wish to schedule the tenants with the hottest un-downloaded
layers first.
- Enable accessing upload-related state without synchronization (it
belongs to HeatmapUploader, rather than being some Mutex<>'d part of
Tenant)
- Avoid further expanding the scope of Tenant/Timeline types, which are
already among the largest in the codebase
- You might reasonably wonder how much of the uploader code could be a
generic job manager thing. Probably some of it: but let's defer pulling
that out until we have at least two users (perhaps secondary downloads
will be the second one) to highlight which bits are really generic.

Compromises:
- Later, instead of using digests of heatmaps to decide whether anything
changed, I would prefer to avoid walking the layers in tenants that
don't have changes: tracking that will be a bit invasive, as it needs
input from both remote_timeline_client and Layer.
---
 Cargo.lock                                    |   1 +
 control_plane/src/pageserver.rs               |   2 +
 libs/pageserver_api/src/models.rs             |   1 +
 pageserver/Cargo.toml                         |   1 +
 pageserver/src/bin/pageserver.rs              |  14 +-
 pageserver/src/config.rs                      |  31 +-
 pageserver/src/http/openapi_spec.yml          |   2 +
 pageserver/src/http/routes.rs                 |  23 +
 pageserver/src/metrics.rs                     |  22 +
 pageserver/src/task_mgr.rs                    |   3 +
 pageserver/src/tenant.rs                      |  22 +
 pageserver/src/tenant/config.rs               |  12 +
 pageserver/src/tenant/mgr.rs                  |  20 +
 .../src/tenant/remote_timeline_client.rs      |  26 +-
 pageserver/src/tenant/secondary.rs            | 104 ++++
 pageserver/src/tenant/secondary/heatmap.rs    |  64 ++
 .../src/tenant/secondary/heatmap_uploader.rs  | 582 ++++++++++++++++++
 pageserver/src/tenant/timeline.rs             |  52 +-
 test_runner/fixtures/pageserver/http.py       |   4 +
 test_runner/fixtures/remote_storage.py        |   8 +
 .../regress/test_attach_tenant_config.py      |   1 +
 .../regress/test_pageserver_secondary.py      |  45 +-
 22 files changed, 1033 insertions(+), 7 deletions(-)
 create mode 100644 pageserver/src/tenant/secondary.rs
 create mode 100644 pageserver/src/tenant/secondary/heatmap.rs
 create mode 100644 pageserver/src/tenant/secondary/heatmap_uploader.rs

diff --git a/Cargo.lock b/Cargo.lock
index 1b6b423444..71c2c976a7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3103,6 +3103,7 @@ dependencies = [
  "humantime-serde",
  "hyper",
  "itertools",
+ "md5",
  "metrics",
  "nix 0.26.2",
  "num-traits",
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 96a41874fd..981ddd3e98 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -407,6 +407,7 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'gc_feedback' as bool")?,
+            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
         };
 
         let request = models::TenantCreateRequest {
@@ -504,6 +505,7 @@ impl PageServerNode {
                     .map(|x| x.parse::<bool>())
                     .transpose()
                     .context("Failed to parse 'gc_feedback' as bool")?,
+                heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
             }
         };
 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index a3029e67a5..fbc7d73235 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -237,6 +237,7 @@ pub struct TenantConfig {
     pub min_resident_size_override: Option<u64>,
     pub evictions_low_residence_duration_metric_threshold: Option<String>,
     pub gc_feedback: Option<bool>,
+    pub heatmap_period: Option<String>,
 }
 
 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 35c260740c..9e8172c6a1 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -36,6 +36,7 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
+md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
 num_cpus = { version = "1.15" }
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 7607119dda..f65c4f4580 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
-use pageserver::tenant::TenantSharedResources;
+use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
 use tokio::time::Instant;
 use tracing::*;
@@ -504,6 +504,17 @@ fn start_pageserver(
         }
     });
 
+    let secondary_controller = if let Some(remote_storage) = &remote_storage {
+        secondary::spawn_tasks(
+            tenant_manager.clone(),
+            remote_storage.clone(),
+            background_jobs_barrier.clone(),
+            shutdown_pageserver.clone(),
+        )
+    } else {
+        secondary::null_controller()
+    };
+
     // shared state between the disk-usage backed eviction background task and the http endpoint
     // that allows triggering disk-usage based eviction manually. note that the http endpoint
     // is still accessible even if background task is not configured as long as remote storage has
@@ -533,6 +544,7 @@ fn start_pageserver(
                 broker_client.clone(),
                 disk_usage_eviction_state,
                 deletion_queue.new_client(),
+                secondary_controller,
             )
             .context("Failed to initialize router state")?,
         );
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 13d1fc775b..cd99cda783 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -70,6 +70,8 @@ pub mod defaults {
     pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
     pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
 
+    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+
     ///
     /// Default built-in configuration file.
     ///
@@ -117,6 +119,8 @@ pub mod defaults {
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
 #gc_feedback = false
 
+#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
+
 [remote_storage]
 
 "#
@@ -215,6 +219,10 @@ pub struct PageServerConf {
     /// If true, pageserver will make best-effort to operate without a control plane: only
     /// for use in major incidents.
     pub control_plane_emergency_mode: bool,
+
+    /// How many heatmap uploads may be done concurrently: lower values implicitly deprioritize
+    /// heatmap uploads vs. other remote storage operations.
+    pub heatmap_upload_concurrency: usize,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -293,6 +301,8 @@ struct PageServerConfigBuilder {
     control_plane_api: BuilderValue<Option<Url>>,
     control_plane_api_token: BuilderValue<Option<SecretString>>,
     control_plane_emergency_mode: BuilderValue<bool>,
+
+    heatmap_upload_concurrency: BuilderValue<usize>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -361,6 +371,8 @@ impl Default for PageServerConfigBuilder {
             control_plane_api: Set(None),
             control_plane_api_token: Set(None),
             control_plane_emergency_mode: Set(false),
+
+            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
         }
     }
 }
@@ -501,6 +513,10 @@ impl PageServerConfigBuilder {
         self.control_plane_emergency_mode = BuilderValue::Set(enabled)
     }
 
+    pub fn heatmap_upload_concurrency(&mut self, value: usize) {
+        self.heatmap_upload_concurrency = BuilderValue::Set(value)
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_size_logical_size_queries = self
             .concurrent_tenant_size_logical_size_queries
@@ -595,6 +611,10 @@ impl PageServerConfigBuilder {
             control_plane_emergency_mode: self
                 .control_plane_emergency_mode
                 .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
+
+            heatmap_upload_concurrency: self
+                .heatmap_upload_concurrency
+                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
         })
     }
 }
@@ -828,7 +848,9 @@ impl PageServerConf {
                 },
                 "control_plane_emergency_mode" => {
                     builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
-
+                },
+                "heatmap_upload_concurrency" => {
+                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                 },
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
@@ -896,6 +918,7 @@ impl PageServerConf {
             control_plane_api: None,
             control_plane_api_token: None,
             control_plane_emergency_mode: false,
+            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
         }
     }
 }
@@ -1120,7 +1143,8 @@ background_task_maximum_delay = '334 s'
                 )?,
                 control_plane_api: None,
                 control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_emergency_mode: false,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1177,7 +1201,8 @@ background_task_maximum_delay = '334 s'
                 background_task_maximum_delay: Duration::from_secs(334),
                 control_plane_api: None,
                 control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_emergency_mode: false,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 9422ccb2fd..fbb059c17b 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1405,6 +1405,8 @@ components:
           type: integer
         trace_read_requests:
           type: boolean
+        heatmap_period:
+          type: string
     TenantConfigResponse:
       type: object
       properties:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index da7e8218a2..618d01dd6a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -42,6 +42,7 @@ use crate::tenant::mgr::{
     GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
     TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
+use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::CompactFlags;
@@ -75,9 +76,11 @@ pub struct State {
     broker_client: storage_broker::BrokerClientChannel,
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
     deletion_queue_client: DeletionQueueClient,
+    secondary_controller: SecondaryController,
 }
 
 impl State {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         conf: &'static PageServerConf,
         tenant_manager: Arc<TenantManager>,
@@ -86,6 +89,7 @@ impl State {
         broker_client: storage_broker::BrokerClientChannel,
         disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
         deletion_queue_client: DeletionQueueClient,
+        secondary_controller: SecondaryController,
     ) -> anyhow::Result<Self> {
         let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
             .iter()
@@ -100,6 +104,7 @@ impl State {
             broker_client,
             disk_usage_eviction_state,
             deletion_queue_client,
+            secondary_controller,
         })
     }
 
@@ -1647,6 +1652,21 @@ async fn disk_usage_eviction_run(
     json_response(StatusCode::OK, res)
 }
 
+async fn secondary_upload_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    state
+        .secondary_controller
+        .upload_tenant(tenant_shard_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(
         StatusCode::NOT_FOUND,
@@ -1906,6 +1926,9 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
             |r| api_handler(r, evict_timeline_layer_handler),
         )
+        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
+            api_handler(r, secondary_upload_handler)
+        })
         .put("/v1/disk_usage_eviction/run", |r| {
             api_handler(r, disk_usage_eviction_run)
         })
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 7cc0333ee5..eefe295f94 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1270,6 +1270,28 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
     )
     .expect("failed to define a metric"),
 });
+pub(crate) struct SecondaryModeMetrics {
+    pub(crate) upload_heatmap: IntCounter,
+    pub(crate) upload_heatmap_errors: IntCounter,
+    pub(crate) upload_heatmap_duration: Histogram,
+}
+pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
+    upload_heatmap: register_int_counter!(
+        "pageserver_secondary_upload_heatmap",
+        "Number of heatmaps written to remote storage by attached tenants"
+    )
+    .expect("failed to define a metric"),
+    upload_heatmap_errors: register_int_counter!(
+        "pageserver_secondary_upload_heatmap_errors",
+        "Failures writing heatmap to remote storage"
+    )
+    .expect("failed to define a metric"),
+    upload_heatmap_duration: register_histogram!(
+        "pageserver_secondary_upload_heatmap_duration",
+        "Time to build and upload a heatmap, including any waiting inside the S3 client"
+    )
+    .expect("failed to define a metric"),
+});
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 8747d9ad50..b80a498c82 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -258,6 +258,9 @@ pub enum TaskKind {
     /// See [`crate::disk_usage_eviction_task`].
     DiskUsageEviction,
 
+    /// See [`crate::tenant::secondary`].
+    SecondaryUploads,
+
     // Initial logical size calculation
     InitialLogicalSizeCalculation,
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index a8e8b4cbfa..89d332f178 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -144,6 +144,7 @@ pub mod storage_layer;
 pub mod config;
 pub mod delete;
 pub mod mgr;
+pub mod secondary;
 pub mod tasks;
 pub mod upload_queue;
 
@@ -2114,6 +2115,14 @@ impl Tenant {
             .attach_mode
             .clone()
     }
+
+    pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+
+    pub(crate) fn get_generation(&self) -> Generation {
+        self.generation
+    }
 }
 
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2252,6 +2261,18 @@ impl Tenant {
             .or(self.conf.default_tenant_conf.min_resident_size_override)
     }
 
+    pub fn get_heatmap_period(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let heatmap_period = tenant_conf
+            .heatmap_period
+            .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
+        if heatmap_period.is_zero() {
+            None
+        } else {
+            Some(heatmap_period)
+        }
+    }
+
     pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
         self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
         // Don't hold self.timelines.lock() during the notifies.
@@ -3694,6 +3715,7 @@ pub(crate) mod harness {
                     tenant_conf.evictions_low_residence_duration_metric_threshold,
                 ),
                 gc_feedback: Some(tenant_conf.gc_feedback),
+                heatmap_period: Some(tenant_conf.heatmap_period),
             }
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 7a454b53d2..25d97f51ce 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -334,6 +334,11 @@ pub struct TenantConf {
     #[serde(with = "humantime_serde")]
     pub evictions_low_residence_duration_metric_threshold: Duration,
     pub gc_feedback: bool,
+
+    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
+    /// may be disabled if a Tenant will not have secondary locations: only secondary
+    /// locations will use the heatmap uploaded by attached locations.
+    pub heatmap_period: Duration,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -414,6 +419,11 @@ pub struct TenantConfOpt {
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
     pub gc_feedback: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    #[serde(default)]
+    pub heatmap_period: Option<Duration>,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -482,6 +492,7 @@ impl TenantConfOpt {
                 .evictions_low_residence_duration_metric_threshold
                 .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
             gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
+            heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
         }
     }
 }
@@ -519,6 +530,7 @@ impl Default for TenantConf {
             )
             .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
             gc_feedback: false,
+            heatmap_period: Duration::ZERO,
         }
     }
 }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 4d7bd4259f..f53951e1d3 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -807,6 +807,12 @@ pub(crate) async fn set_new_tenant_config(
 }
 
 impl TenantManager {
+    /// Convenience function so that anyone with a TenantManager can get at the global configuration, without
+    /// having to pass it around everywhere as a separate object.
+    pub(crate) fn get_conf(&self) -> &'static PageServerConf {
+        self.conf
+    }
+
     /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
     /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
     pub(crate) fn get_attached_tenant_shard(
@@ -1087,6 +1093,20 @@ impl TenantManager {
 
         Ok(())
     }
+
+    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
+        let locked = self.tenants.read().unwrap();
+        match &*locked {
+            TenantsMap::Initializing => Vec::new(),
+            TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map
+                .values()
+                .filter_map(|slot| {
+                    slot.get_attached()
+                        .and_then(|t| if t.is_active() { Some(t.clone()) } else { None })
+                })
+                .collect(),
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 3765ff6e7a..4b271a7395 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -180,7 +180,7 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
 
-mod download;
+pub(crate) mod download;
 pub mod index;
 mod upload;
 
@@ -1604,6 +1604,23 @@ impl RemoteTimelineClient {
             }
         }
     }
+
+    pub(crate) fn get_layers_metadata(
+        &self,
+        layers: Vec<LayerFileName>,
+    ) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
+        let q = self.upload_queue.lock().unwrap();
+        let q = match &*q {
+            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
+                anyhow::bail!("queue is in state {}", q.as_str())
+            }
+            UploadQueue::Initialized(inner) => inner,
+        };
+
+        let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
+
+        Ok(decorated.collect())
+    }
 }
 
 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -1659,6 +1676,13 @@ pub fn remote_index_path(
     .expect("Failed to construct path")
 }
 
+pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
+
+pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
+    RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
+        .expect("Failed to construct path")
+}
+
 /// Given the key of an index, parse out the generation part of the name
 pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
     let file_name = match path.get_path().file_name() {
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
new file mode 100644
index 0000000000..d25fe56b92
--- /dev/null
+++ b/pageserver/src/tenant/secondary.rs
@@ -0,0 +1,104 @@
+pub mod heatmap;
+mod heatmap_uploader;
+
+use std::sync::Arc;
+
+use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+
+use self::heatmap_uploader::heatmap_uploader_task;
+
+use super::mgr::TenantManager;
+
+use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
+
+use tokio_util::sync::CancellationToken;
+use utils::completion::Barrier;
+
+enum UploadCommand {
+    Upload(TenantShardId),
+}
+
+struct CommandRequest<T> {
+    payload: T,
+    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+}
+
+struct CommandResponse {
+    result: anyhow::Result<()>,
+}
+
+/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
+/// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
+/// where we want to immediately upload/download for a particular tenant.  In normal operation
+/// uploads & downloads are autonomous and not driven by this interface.
+pub struct SecondaryController {
+    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
+}
+
+impl SecondaryController {
+    async fn dispatch<T>(
+        &self,
+        queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
+        payload: T,
+    ) -> anyhow::Result<()> {
+        let (response_tx, response_rx) = tokio::sync::oneshot::channel();
+
+        queue
+            .send(CommandRequest {
+                payload,
+                response_tx,
+            })
+            .await
+            .map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
+
+        let response = response_rx
+            .await
+            .map_err(|_| anyhow::anyhow!("Request dropped"))?;
+
+        response.result
+    }
+
+    pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
+            .await
+    }
+}
+
+pub fn spawn_tasks(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> SecondaryController {
+    let (upload_req_tx, upload_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
+
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryUploads,
+        None,
+        None,
+        "heatmap uploads",
+        false,
+        async move {
+            heatmap_uploader_task(
+                tenant_manager,
+                remote_storage,
+                upload_req_rx,
+                background_jobs_can_start,
+                cancel,
+            )
+            .await
+        },
+    );
+
+    SecondaryController { upload_req_tx }
+}
+
+/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
+pub fn null_controller() -> SecondaryController {
+    let (upload_req_tx, _upload_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
+    SecondaryController { upload_req_tx }
+}
diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
new file mode 100644
index 0000000000..99aaaeb8c8
--- /dev/null
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -0,0 +1,64 @@
+use std::time::SystemTime;
+
+use crate::tenant::{
+    remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
+};
+
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
+
+use utils::{generation::Generation, id::TimelineId};
+
+#[derive(Serialize, Deserialize)]
+pub(super) struct HeatMapTenant {
+    /// Generation of the attached location that uploaded the heatmap: this is not required
+    /// for correctness, but acts as a hint to secondary locations in order to detect thrashing
+    /// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
+    pub(super) generation: Generation,
+
+    pub(super) timelines: Vec<HeatMapTimeline>,
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub(crate) struct HeatMapTimeline {
+    #[serde_as(as = "DisplayFromStr")]
+    pub(super) timeline_id: TimelineId,
+
+    pub(super) layers: Vec<HeatMapLayer>,
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub(crate) struct HeatMapLayer {
+    pub(super) name: LayerFileName,
+    pub(super) metadata: IndexLayerMetadata,
+
+    #[serde_as(as = "TimestampSeconds<i64>")]
+    pub(super) access_time: SystemTime,
+    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
+    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
+}
+
+impl HeatMapLayer {
+    pub(crate) fn new(
+        name: LayerFileName,
+        metadata: IndexLayerMetadata,
+        access_time: SystemTime,
+    ) -> Self {
+        Self {
+            name,
+            metadata,
+            access_time,
+        }
+    }
+}
+
+impl HeatMapTimeline {
+    pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
+        Self {
+            timeline_id,
+            layers,
+        }
+    }
+}
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
new file mode 100644
index 0000000000..ece2b93ce1
--- /dev/null
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -0,0 +1,582 @@
+use std::{
+    collections::HashMap,
+    sync::{Arc, Weak},
+    time::{Duration, Instant},
+};
+
+use crate::{
+    metrics::SECONDARY_MODE,
+    tenant::{
+        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
+        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
+    },
+};
+
+use md5;
+use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
+
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use tracing::instrument;
+use utils::{backoff, completion::Barrier};
+
+use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
+
+/// Period between heatmap uploader walking Tenants to look for work to do.
+/// If any tenant has a heatmap upload period lower than this, the interval is adjusted
+/// downward to match, but never below `MIN_SCHEDULING_INTERVAL`.
+const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
+const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
+
+struct WriteInProgress {
+    barrier: Barrier,
+}
+
+struct UploadPending {
+    tenant: Arc<Tenant>,
+    last_digest: Option<md5::Digest>,
+}
+
+struct WriteComplete {
+    tenant_shard_id: TenantShardId,
+    completed_at: Instant,
+    digest: Option<md5::Digest>,
+    next_upload: Option<Instant>,
+}
+
+/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
+/// when we last did a write.  We only populate this after doing at least one
+/// write for a tenant -- this avoids holding state for tenants that have
+/// uploads disabled.
+
+struct UploaderTenantState {
+    // This Weak only exists to enable culling idle instances of this type
+    // when the Tenant has been deallocated.
+    tenant: Weak<Tenant>,
+
+    /// Digest of the serialized heatmap that we last successfully uploaded
+    ///
+    /// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
+    /// which is also an md5sum.
+    last_digest: Option<md5::Digest>,
+
+    /// When the last upload attempt completed (may have been successful or failed)
+    last_upload: Option<Instant>,
+
+    /// When should we next do an upload?  None means never.
+    next_upload: Option<Instant>,
+}
+
+/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
+/// handling loop and mutates it as needed: there are no locks here, because that event loop
+/// can hold &mut references to this type throughout.
+struct HeatmapUploader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+
+    tenants: HashMap<TenantShardId, UploaderTenantState>,
+
+    /// Tenants with work to do, for which tasks should be spawned as soon as concurrency
+    /// limits permit it.
+    tenants_pending: std::collections::VecDeque<UploadPending>,
+
+    /// Tenants for which a task in `tasks` has been spawned.
+    tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
+
+    tasks: JoinSet<()>,
+
+    /// Channel for our child tasks to send results to: we use a channel for results rather than
+    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
+    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
+    /// behavior.
+    task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
+    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
+
+    concurrent_uploads: usize,
+
+    scheduling_interval: Duration,
+}
+
+/// The uploader task runs a loop that periodically wakes up and schedules tasks for
+/// tenants that require an upload, or handles any commands that have been sent into
+/// `command_queue`.  No I/O is done in this loop: that all happens in the tasks we
+/// spawn.
+///
+/// Scheduling iterations are somewhat infrequent.  However, each one will enqueue
+/// all tenants that require an upload, and in between scheduling iterations we will
+/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
+///
+/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
+/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
+/// we might block waiting on a Tenant.
+pub(super) async fn heatmap_uploader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
+
+    let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+    let mut uploader = HeatmapUploader {
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+        tasks: JoinSet::new(),
+        tenants: HashMap::new(),
+        tenants_pending: std::collections::VecDeque::new(),
+        tenants_uploading: HashMap::new(),
+        task_result_tx: result_tx,
+        task_result_rx: result_rx,
+        concurrent_uploads,
+        scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
+    };
+
+    tracing::info!("Waiting for background_jobs_can start...");
+    background_jobs_can_start.wait().await;
+    tracing::info!("background_jobs_can is ready, proceeding.");
+
+    while !cancel.is_cancelled() {
+        // Look for new work: this is relatively expensive because we have to go acquire the lock on
+        // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
+        // require an upload.
+        uploader.schedule_iteration().await?;
+
+        // Between scheduling iterations, we will:
+        //  - Drain any complete tasks and spawn pending tasks
+        //  - Handle incoming administrative commands
+        //  - Check our cancellation token
+        let next_scheduling_iteration = Instant::now()
+            .checked_add(uploader.scheduling_interval)
+            .unwrap_or_else(|| {
+                tracing::warn!(
+                    "Scheduling interval invalid ({}s), running immediately!",
+                    uploader.scheduling_interval.as_secs_f64()
+                );
+                Instant::now()
+            });
+        loop {
+            tokio::select! {
+                _ = cancel.cancelled() => {
+                    // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
+                    tracing::info!("Heatmap uploader joining tasks");
+                    while let Some(_r) = uploader.tasks.join_next().await {};
+                    tracing::info!("Heatmap uploader terminating");
+
+                    break;
+                },
+                _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
+                    tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
+                    break;},
+                cmd = command_queue.recv() => {
+                    tracing::debug!("heatmap_uploader_task: woke for command queue");
+                    let cmd = match cmd {
+                        Some(c) =>c,
+                        None => {
+                            // SecondaryController was destroyed, and this has raced with
+                            // our CancellationToken
+                            tracing::info!("Heatmap uploader terminating");
+                            cancel.cancel();
+                            break;
+                        }
+                    };
+
+                    let CommandRequest{
+                        response_tx,
+                        payload
+                    } = cmd;
+                    uploader.handle_command(payload, response_tx);
+                },
+                _ = uploader.process_next_completion() => {
+                    if !cancel.is_cancelled() {
+                        uploader.spawn_pending();
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+impl HeatmapUploader {
+    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
+    async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
+        // Cull entries in self.tenants whose Tenant is gone, or which have no upload scheduled
+        self.tenants
+            .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
+
+        // The priority order of previously scheduled work may be invalidated by current state: drop
+        // all pending work (it will be re-scheduled if still needed)
+        self.tenants_pending.clear();
+
+        // Use a fixed 'now' through the following loop, for efficiency and fairness.
+        let now = Instant::now();
+
+        // While iterating over the potentially-long list of tenants, we will periodically yield
+        // to avoid blocking executor.
+        const YIELD_ITERATIONS: usize = 1000;
+
+        // Iterate over tenants looking for work to do.
+        let tenants = self.tenant_manager.get_attached_active_tenant_shards();
+        for (i, tenant) in tenants.into_iter().enumerate() {
+            // Process is shutting down, drop out
+            if self.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            // Skip tenants that already have a write in flight
+            if self
+                .tenants_uploading
+                .contains_key(tenant.get_tenant_shard_id())
+            {
+                continue;
+            }
+
+            self.maybe_schedule_upload(&now, tenant);
+            // Parenthesized: `%` binds tighter than `+`, so `i + 1 % N` would never be zero.
+            if (i + 1) % YIELD_ITERATIONS == 0 {
+                tokio::task::yield_now().await;
+            }
+        }
+
+        // Spawn tasks for as many of our pending tenants as we can.
+        self.spawn_pending();
+
+        Ok(())
+    }
+
+    /// Wait for the next upload task to report its result, then record it via `on_completion`.
+    /// Cancellation: this method is cancel-safe.
+    async fn process_next_completion(&mut self) {
+        match self.task_result_rx.recv().await {
+            Some(r) => {
+                self.on_completion(r);
+            }
+            None => {
+                unreachable!("Result sender is stored on Self");
+            }
+        }
+    }
+
+    /// The 'maybe' refers to the tenant's state: whether it is configured
+    /// for heatmap uploads at all, and whether sufficient time has passed
+    /// since the last upload.
+    fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
+        match tenant.get_heatmap_period() {
+            None => {
+                // Heatmaps are disabled for this tenant
+                return;
+            }
+            Some(period) => {
+                // If any tenant has asked for uploads more frequent than our scheduling interval,
+                // reduce it to match so that we can keep up.  This is mainly useful in testing, where
+                // we may set rather short intervals.
+                if period < self.scheduling_interval {
+                    self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
+                }
+            }
+        }
+
+        // Stale attachments do not upload anything: if we are in this state, there is probably some
+        // other attachment in mode Single or Multi running on another pageserver, and we don't
+        // want to thrash and overwrite their heatmap uploads.
+        if tenant.get_attach_mode() == AttachmentMode::Stale {
+            return;
+        }
+
+        // Create an entry in self.tenants if one doesn't already exist: this will later be updated
+        // with the completion time in on_completion.
+        let state = self
+            .tenants
+            .entry(*tenant.get_tenant_shard_id())
+            .or_insert_with(|| UploaderTenantState {
+                tenant: Arc::downgrade(&tenant),
+                last_upload: None,
+                next_upload: Some(Instant::now()),
+                last_digest: None,
+            });
+
+        // Decline to do the upload if insufficient time has passed
+        if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
+            return;
+        }
+
+        let last_digest = state.last_digest;
+        self.tenants_pending.push_back(UploadPending {
+            tenant,
+            last_digest,
+        })
+    }
+
+    /// Start uploads for queued tenants, in order, until the concurrency limit is reached.
+    fn spawn_pending(&mut self) {
+        while self.tenants_uploading.len() < self.concurrent_uploads {
+            let Some(pending) = self.tenants_pending.pop_front() else {
+                break;
+            };
+            self.spawn_upload(pending.tenant, pending.last_digest);
+        }
+    }
+
+    fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
+        let remote_storage = self.remote_storage.clone();
+        let tenant_shard_id = *tenant.get_tenant_shard_id();
+        let (completion, barrier) = utils::completion::channel();
+        let result_tx = self.task_result_tx.clone();
+        self.tasks.spawn(async move {
+            // Guard for the barrier in [`WriteInProgress`]
+            let _completion = completion;
+
+            let started_at = Instant::now();
+            let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
+                Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
+                    let duration = Instant::now().duration_since(started_at);
+                    SECONDARY_MODE
+                        .upload_heatmap_duration
+                        .observe(duration.as_secs_f64());
+                    SECONDARY_MODE.upload_heatmap.inc();
+                    Some(digest)
+                }
+                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
+                Err(UploadHeatmapError::Upload(e)) => {
+                    tracing::warn!(
+                        "Failed to upload heatmap for tenant {}: {e:#}",
+                        tenant.get_tenant_shard_id(),
+                    );
+                    let duration = Instant::now().duration_since(started_at);
+                    SECONDARY_MODE
+                        .upload_heatmap_duration
+                        .observe(duration.as_secs_f64());
+                    SECONDARY_MODE.upload_heatmap_errors.inc();
+                    last_digest
+                }
+                Err(UploadHeatmapError::Cancelled) => {
+                    tracing::info!("Cancelled heatmap upload, shutting down");
+                    last_digest
+                }
+            };
+
+            let now = Instant::now();
+            let next_upload = tenant
+                .get_heatmap_period()
+                .and_then(|period| now.checked_add(period));
+
+            result_tx
+                .send(WriteComplete {
+                    tenant_shard_id: *tenant.get_tenant_shard_id(),
+                    completed_at: now,
+                    digest,
+                    next_upload,
+                })
+                .ok();
+        });
+
+        self.tenants_uploading
+            .insert(tenant_shard_id, WriteInProgress { barrier });
+    }
+
+    /// Apply the result of a finished upload task: clear the tenant's in-flight entry and
+    /// record the completion time, digest and next upload time in its state, if it has any.
+    #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
+    fn on_completion(&mut self, completion: WriteComplete) {
+        tracing::debug!("Heatmap upload completed");
+        let WriteComplete {
+            tenant_shard_id,
+            completed_at,
+            digest,
+            next_upload,
+        } = completion;
+        // The upload is no longer in flight.
+        self.tenants_uploading.remove(&tenant_shard_id);
+
+        // If the tenant's state is gone, there is nothing to update.
+        let Some(state) = self.tenants.get_mut(&tenant_shard_id) else {
+            return;
+        };
+        state.last_upload = Some(completed_at);
+        state.last_digest = digest;
+        state.next_upload = next_upload;
+    }
+
+    fn handle_command(
+        &mut self,
+        command: UploadCommand,
+        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+    ) {
+        match command {
+            UploadCommand::Upload(tenant_shard_id) => {
+                // If an upload was ongoing for this tenant, let it finish first.
+                let barrier = if let Some(writing_state) =
+                    self.tenants_uploading.get(&tenant_shard_id)
+                {
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Waiting for heatmap write to complete");
+                    writing_state.barrier.clone()
+                } else {
+                    // Spawn the upload then immediately wait for it.  This will block processing of other commands and
+                    // starting of other background work.
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Starting heatmap write on command");
+                    let tenant = match self
+                        .tenant_manager
+                        .get_attached_tenant_shard(tenant_shard_id, true)
+                    {
+                        Ok(t) => t,
+                        Err(e) => {
+                            // Drop result of send: we don't care if caller dropped their receiver
+                            drop(response_tx.send(CommandResponse {
+                                result: Err(e.into()),
+                            }));
+                            return;
+                        }
+                    };
+                    self.spawn_upload(tenant, None);
+                    let writing_state = self
+                        .tenants_uploading
+                        .get(&tenant_shard_id)
+                        .expect("We just inserted this");
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Waiting for heatmap upload to complete");
+
+                    writing_state.barrier.clone()
+                };
+
+                // This task does no I/O: it only listens for a barrier's completion and then
+                // sends to the command response channel.  It is therefore safe to spawn this without
+                // any gates/task_mgr hooks.
+                tokio::task::spawn(async move {
+                    barrier.wait().await;
+
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Heatmap upload complete");
+
+                    // Drop result of send: we don't care if caller dropped their receiver
+                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
+                });
+            }
+        }
+    }
+}
+
+enum UploadHeatmapOutcome {
+    /// We successfully wrote to remote storage, with this digest.
+    Uploaded(md5::Digest),
+    /// We did not upload because the heatmap digest was unchanged since the last upload
+    NoChange,
+    /// We skipped the upload for some reason, such as tenant/timeline not ready
+    Skipped,
+}
+
+#[derive(thiserror::Error, Debug)]
+enum UploadHeatmapError {
+    #[error("Cancelled")]
+    Cancelled,
+
+    #[error(transparent)]
+    Upload(#[from] anyhow::Error),
+}
+
+/// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
+/// of the object we would have uploaded.
+#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
+async fn upload_tenant_heatmap(
+    remote_storage: GenericRemoteStorage,
+    tenant: &Arc<Tenant>,
+    last_digest: Option<md5::Digest>,
+) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
+    debug_assert_current_span_has_tenant_id();
+
+    let generation = tenant.get_generation();
+    if generation.is_none() {
+        // We do not expect this: generations were implemented before heatmap uploads.  However,
+        // handle it so that we don't have to make the generation in the heatmap an Option<>
+        // (Generation::none is not serializable)
+        tracing::warn!("Skipping heatmap upload for tenant with generation==None");
+        return Ok(UploadHeatmapOutcome::Skipped);
+    }
+
+    let mut heatmap = HeatMapTenant {
+        timelines: Vec::new(),
+        generation,
+    };
+    let timelines = tenant.timelines.lock().unwrap().clone();
+
+    let tenant_cancel = tenant.cancel.clone();
+
+    // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
+    // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
+    // in remote storage.
+    let _guard = match tenant.gate.enter() {
+        Ok(g) => g,
+        Err(_) => {
+            tracing::info!("Skipping heatmap upload for tenant which is shutting down");
+            return Err(UploadHeatmapError::Cancelled);
+        }
+    };
+
+    for (timeline_id, timeline) in timelines {
+        let heatmap_timeline = timeline.generate_heatmap().await;
+        match heatmap_timeline {
+            None => {
+                tracing::debug!(
+                    "Skipping heatmap upload because timeline {timeline_id} is not ready"
+                );
+                return Ok(UploadHeatmapOutcome::Skipped);
+            }
+            Some(heatmap_timeline) => {
+                heatmap.timelines.push(heatmap_timeline);
+            }
+        }
+    }
+
+    // Serialize the heatmap
+    let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
+    let size = bytes.len();
+
+    // Drop out early if nothing changed since our last upload
+    let digest = md5::compute(&bytes);
+    if Some(digest) == last_digest {
+        return Ok(UploadHeatmapOutcome::NoChange);
+    }
+
+    let path = remote_heatmap_path(tenant.get_tenant_shard_id());
+
+    // Write the heatmap.
+    tracing::debug!("Uploading {size} byte heatmap to {path}");
+    if let Err(e) = backoff::retry(
+        || async {
+            let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
+                bytes.clone(),
+            ))));
+            remote_storage
+                .upload_storage_object(bytes, size, &path)
+                .await
+        },
+        |_| false,
+        3,
+        u32::MAX,
+        "Uploading heatmap",
+        backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
+    )
+    .await
+    {
+        if tenant_cancel.is_cancelled() {
+            return Err(UploadHeatmapError::Cancelled);
+        } else {
+            return Err(e.into());
+        }
+    }
+
+    tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
+
+    Ok(UploadHeatmapOutcome::Uploaded(digest))
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d0241eafd5..466c92d32b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -98,8 +98,9 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
 use super::config::TenantConf;
-use super::remote_timeline_client::index::IndexPart;
+use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
 use super::remote_timeline_client::RemoteTimelineClient;
+use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -2055,6 +2056,55 @@ impl Timeline {
 
         None
     }
+
+    /// The timeline heatmap is a hint to secondary locations from the primary location,
+    /// indicating which layers are currently on-disk on the primary.
+    ///
+    /// None is returned if the Timeline is in a state where uploading a heatmap
+    /// doesn't make sense, such as shutting down or initializing.  The caller
+    /// should treat this as a cue to simply skip doing any heatmap uploading
+    /// for this timeline.
+    pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
+        let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
+
+        let remote_client = match &self.remote_client {
+            Some(c) => c,
+            None => return None,
+        };
+
+        let layer_file_names = eviction_info
+            .resident_layers
+            .iter()
+            .map(|l| l.layer.layer_desc().filename())
+            .collect::<Vec<_>>();
+
+        let decorated = match remote_client.get_layers_metadata(layer_file_names) {
+            Ok(d) => d,
+            Err(_) => {
+                // Getting metadata only fails on Timeline in bad state.
+                return None;
+            }
+        };
+
+        let heatmap_layers = std::iter::zip(
+            eviction_info.resident_layers.into_iter(),
+            decorated.into_iter(),
+        )
+        .filter_map(|(layer, remote_info)| {
+            remote_info.map(|remote_info| {
+                HeatMapLayer::new(
+                    layer.layer.layer_desc().filename(),
+                    IndexLayerMetadata::from(remote_info),
+                    layer.last_activity_ts,
+                )
+            })
+        });
+
+        Some(HeatMapTimeline::new(
+            self.timeline_id,
+            heatmap_layers.collect(),
+        ))
+    }
 }
 
 type TraversalId = String;
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index b46ddf5527..eda8813c36 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -322,6 +322,10 @@ class PageserverHttpClient(requests.Session):
         self.verbose_error(res)
         return TenantConfig.from_json(res.json())
 
+    def tenant_heatmap_upload(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
+        self.verbose_error(res)
+
     def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
         assert "tenant_id" not in config.keys()
         res = self.put(
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 824531bea4..c0c2383feb 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -16,6 +16,7 @@ from fixtures.log_helper import log
 from fixtures.types import TenantId, TimelineId
 
 TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
+TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"
 
 
 @enum.unique
@@ -133,6 +134,13 @@ class LocalFsStorage:
         with self.index_path(tenant_id, timeline_id).open("r") as f:
             return json.load(f)
 
+    def heatmap_path(self, tenant_id: TenantId) -> Path:
+        return self.tenant_path(tenant_id) / TENANT_HEATMAP_FILE_NAME
+
+    def heatmap_content(self, tenant_id):
+        with self.heatmap_path(tenant_id).open("r") as f:
+            return json.load(f)
+
     def to_toml_inline_table(self) -> str:
         rv = {
             "local_path": str(self.root),
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 70d386a566..d2bd53d8aa 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -163,6 +163,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "gc_feedback": True,
         "gc_horizon": 23 * (1024 * 1024),
         "gc_period": "2h 13m",
+        "heatmap_period": "10m",
         "image_creation_threshold": 7,
         "pitr_interval": "1m",
         "lagging_wal_timeout": "23m",
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index b14b7f1328..64ade346aa 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -4,7 +4,7 @@ from typing import Any, Dict, Optional
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
-from fixtures.remote_storage import RemoteStorageKind
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
@@ -330,3 +330,46 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
 
     workload.churn_rows(64, pageserver_b.id)
     workload.validate(pageserver_b.id)
+
+
+def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that heatmap uploads land in remote storage and reflect newly written layers.
+    """
+    env = neon_env_builder.init_start()  # initial_tenant_conf=TENANT_CONF)
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Write some data so that we have some layers
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+
+    # Write some layers and upload a heatmap
+    workload.write_rows(256, env.pageservers[0].id)
+    env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
+
+    def validate_heatmap(heatmap):
+        assert len(heatmap["timelines"]) == 1
+        assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id)
+        assert len(heatmap["timelines"][0]["layers"]) > 0
+        layers = heatmap["timelines"][0]["layers"]
+
+        # Each layer appears at most once
+        assert len(set(layer["name"] for layer in layers)) == len(layers)
+
+    # Download and inspect the heatmap that the pageserver uploaded
+    heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id)
+    log.info(f"Read back heatmap: {heatmap_first}")
+    validate_heatmap(heatmap_first)
+
+    # Do some more I/O to generate more layers
+    workload.churn_rows(64, env.pageservers[0].id)
+    env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
+
+    # Ensure that another heatmap upload includes the new layers
+    heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id)
+    log.info(f"Read back heatmap: {heatmap_second}")
+    assert heatmap_second != heatmap_first
+    validate_heatmap(heatmap_second)

From aa5581d14f6bad1a878024faecb366ec369cc5d5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 14 Dec 2023 16:31:41 +0100
Subject: [PATCH 096/202] utils::logging: TracingEventCountLayer: don't use
 with_label_values() on hot path (#6129)

fixes #6126
---
 libs/utils/src/logging.rs | 61 ++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 17 deletions(-)

diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 2f09c2f3ea..f7b73dc984 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,6 +1,7 @@
 use std::str::FromStr;
 
 use anyhow::Context;
+use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};
 
@@ -24,16 +25,48 @@ impl LogFormat {
     }
 }
 
-static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-    metrics::register_int_counter_vec!(
+struct TracingEventCountMetric {
+    error: IntCounter,
+    warn: IntCounter,
+    info: IntCounter,
+    debug: IntCounter,
+    trace: IntCounter,
+}
+
+static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
+    let vec = metrics::register_int_counter_vec!(
         "libmetrics_tracing_event_count",
         "Number of tracing events, by level",
         &["level"]
     )
-    .expect("failed to define metric")
+    .expect("failed to define metric");
+    TracingEventCountMetric::new(vec)
 });
 
-struct TracingEventCountLayer(&'static metrics::IntCounterVec);
+impl TracingEventCountMetric {
+    fn new(vec: IntCounterVec) -> Self {
+        Self {
+            error: vec.with_label_values(&["error"]),
+            warn: vec.with_label_values(&["warn"]),
+            info: vec.with_label_values(&["info"]),
+            debug: vec.with_label_values(&["debug"]),
+            trace: vec.with_label_values(&["trace"]),
+        }
+    }
+
+    fn inc_for_level(&self, level: tracing::Level) {
+        let counter = match level {
+            tracing::Level::ERROR => &self.error,
+            tracing::Level::WARN => &self.warn,
+            tracing::Level::INFO => &self.info,
+            tracing::Level::DEBUG => &self.debug,
+            tracing::Level::TRACE => &self.trace,
+        };
+        counter.inc();
+    }
+}
+
+struct TracingEventCountLayer(&'static TracingEventCountMetric);
 
 impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
 where
@@ -44,15 +77,7 @@ where
         event: &tracing::Event<'_>,
         _ctx: tracing_subscriber::layer::Context<'_, S>,
     ) {
-        let level = event.metadata().level();
-        let level = match *level {
-            tracing::Level::ERROR => "error",
-            tracing::Level::WARN => "warn",
-            tracing::Level::INFO => "info",
-            tracing::Level::DEBUG => "debug",
-            tracing::Level::TRACE => "trace",
-        };
-        self.0.with_label_values(&[level]).inc();
+        self.0.inc_for_level(*event.metadata().level());
     }
 }
 
@@ -106,7 +131,9 @@ pub fn init(
         };
         log_layer.with_filter(rust_log_env_filter())
     });
-    let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
+    let r = r.with(
+        TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
+    );
     match tracing_error_layer_enablement {
         TracingErrorLayerEnablement::EnableWithRustLogFilter => r
             .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
@@ -257,14 +284,14 @@ impl std::fmt::Debug for SecretString {
 mod tests {
     use metrics::{core::Opts, IntCounterVec};
 
-    use super::TracingEventCountLayer;
+    use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
 
     #[test]
     fn tracing_event_count_metric() {
         let counter_vec =
             IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
-        let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
-        let layer = TracingEventCountLayer(counter_vec);
+        let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
+        let layer = TracingEventCountLayer(metric);
         use tracing_subscriber::prelude::*;
 
         tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {

From cc633585dca4c98e028ba37acf578f7c8cd17c99 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 14 Dec 2023 17:21:39 +0000
Subject: [PATCH 097/202] gauge guards (#6138)

## Problem

The websockets gauge for active db connections seems to be growing more
than the gauge for client connections over websockets, which does not
make sense.

## Summary of changes

Refactor how our counter-pair gauges are represented. Not sure if this
will fix the problem, but it should be harder to mess up the
counters. The API is also much nicer now and doesn't require
scopeguard::defer hacks.
---
 libs/metrics/src/lib.rs               | 139 +++++++++++++++++++++++++-
 pageserver/src/metrics.rs             |  36 +++----
 pageserver/src/tenant/tasks.rs        |   8 +-
 proxy/src/compute.rs                  |  12 ++-
 proxy/src/proxy.rs                    |  65 ++++--------
 proxy/src/serverless.rs               |  22 ++--
 proxy/src/serverless/conn_pool.rs     |  15 ++-
 proxy/src/serverless/sql_over_http.rs |   9 +-
 safekeeper/src/handler.rs             |   7 +-
 safekeeper/src/metrics.rs             |  13 +--
 10 files changed, 209 insertions(+), 117 deletions(-)

diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index ed375a152f..d09ba11344 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -3,8 +3,11 @@
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]
+
 use once_cell::sync::Lazy;
-use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
+use prometheus::core::{
+    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
+};
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
@@ -132,3 +135,137 @@ fn get_rusage_stats() -> libc::rusage {
         rusage.assume_init()
     }
 }
+
+/// Create an [`IntCounterPairVec`] and register it with the default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_int_counter_pair_vec {
+    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
+        match (
+            $crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES),
+            $crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES),
+        ) {
+            (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)),
+            (Err(e), _) | (_, Err(e)) => Err(e),
+        }
+    }};
+}
+/// Create an [`IntCounterPair`] and register it with the default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_int_counter_pair {
+    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
+        match (
+            $crate::register_int_counter!($NAME1, $HELP1),
+            $crate::register_int_counter!($NAME2, $HELP2),
+        ) {
+            (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)),
+            (Err(e), _) | (_, Err(e)) => Err(e),
+        }
+    }};
+}
+
+/// A Pair of [`GenericCounterVec`]s. Like a [`GenericGaugeVec`] but will always observe changes
+pub struct GenericCounterPairVec<P: Atomic> {
+    inc: GenericCounterVec<P>,
+    dec: GenericCounterVec<P>,
+}
+
+/// A Pair of [`GenericCounter`]s. Like a [`GenericGauge`] but will always observe changes
+pub struct GenericCounterPair<P: Atomic> {
+    inc: GenericCounter<P>,
+    dec: GenericCounter<P>,
+}
+
+impl<P: Atomic> GenericCounterPairVec<P> {
+    pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
+        Self { inc, dec }
+    }
+
+    /// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
+    /// of label values (same order as the VariableLabels in Desc). If that combination of
+    /// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
+    ///
+    /// An error is returned if the number of label values is not the same as the
+    /// number of VariableLabels in Desc.
+    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
+        Ok(GenericCounterPair {
+            inc: self.inc.get_metric_with_label_values(vals)?,
+            dec: self.dec.get_metric_with_label_values(vals)?,
+        })
+    }
+
+    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
+    /// occurs.
+    pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
+        self.get_metric_with_label_values(vals).unwrap()
+    }
+}
+
+impl<P: Atomic> GenericCounterPair<P> {
+    pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
+        Self { inc, dec }
+    }
+
+    /// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
+    pub fn guard(&self) -> GenericCounterPairGuard<P> {
+        self.inc.inc();
+        GenericCounterPairGuard(self.dec.clone())
+    }
+
+    /// Increment the gauge by n, returning a guard that decrements by n on drop.
+    pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
+        self.inc.inc_by(n);
+        GenericCounterPairGuardBy(self.dec.clone(), n)
+    }
+
+    /// Increase the gauge by 1.
+    #[inline]
+    pub fn inc(&self) {
+        self.inc.inc();
+    }
+
+    /// Decrease the gauge by 1.
+    #[inline]
+    pub fn dec(&self) {
+        self.dec.inc();
+    }
+
+    /// Add the given value to the gauge. (This increments the underlying
+    /// `inc` counter, so the value cannot be negative.)
+    #[inline]
+    pub fn inc_by(&self, v: P::T) {
+        self.inc.inc_by(v);
+    }
+
+    /// Subtract the given value from the gauge. (This increments the underlying
+    /// `dec` counter, so the value cannot be negative.)
+    #[inline]
+    pub fn dec_by(&self, v: P::T) {
+        self.dec.inc_by(v);
+    }
+}
+
+/// Guard returned by [`GenericCounterPair::guard`]
+pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
+
+impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
+    fn drop(&mut self) {
+        self.0.inc();
+    }
+}
+/// Guard returned by [`GenericCounterPair::guard_by`]
+pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);
+
+impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
+    fn drop(&mut self) {
+        self.0.inc_by(self.1);
+    }
+}
+
+/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes
+pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
+
+/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes
+pub type IntCounterPair = GenericCounterPair<AtomicU64>;
+
+/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
+pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index eefe295f94..ba6fd00bd1 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2,9 +2,10 @@ use enum_map::EnumMap;
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
     register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
-    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
-    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
-    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
+    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
+    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
+    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
@@ -1343,25 +1344,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("Failed to register tenant_task_events metric")
 });
 
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_start_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls started",
-            &["task"],
-        )
-        .unwrap()
-    });
-
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_finish_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-            &["task"],
-        )
-        .unwrap()
-    });
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_background_loop_semaphore_wait_start_count",
+        "Counter for background loop concurrency-limiting semaphore acquire calls started",
+        "pageserver_background_loop_semaphore_wait_finish_count",
+        "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+        &["task"],
+    )
+    .unwrap()
+});
 
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index dc23030218..4b118442f4 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -63,12 +63,10 @@ pub(crate) async fn concurrent_background_tasks_rate_limit(
     _ctx: &RequestContext,
     cancel: &CancellationToken,
 ) -> Result<impl Drop, RateLimitError> {
-    crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
+    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
         .with_label_values(&[loop_kind.as_static_str()])
-        .inc();
-    scopeguard::defer!(
-        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
-    );
+        .guard();
+
     tokio::select! {
         permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
             match permit {
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 78c56300a5..f5f7270bf4 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,9 +1,13 @@
 use crate::{
-    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
-    error::UserFacingError, proxy::neon_option,
+    auth::parse_endpoint_param,
+    cancellation::CancelClosure,
+    console::errors::WakeComputeError,
+    error::UserFacingError,
+    proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE},
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
+use metrics::IntCounterPairGuard;
 use pq_proto::StartupMessageParams;
 use std::{io, net::SocketAddr, time::Duration};
 use thiserror::Error;
@@ -223,6 +227,8 @@ pub struct PostgresConnection {
     pub params: std::collections::HashMap<String, String>,
     /// Query cancellation token.
     pub cancel_closure: CancelClosure,
+
+    _guage: IntCounterPairGuard,
 }
 
 impl ConnCfg {
@@ -231,6 +237,7 @@ impl ConnCfg {
         &self,
         allow_self_signed_compute: bool,
         timeout: Duration,
+        proto: &'static str,
     ) -> Result<PostgresConnection, ConnectionError> {
         let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
 
@@ -264,6 +271,7 @@ impl ConnCfg {
             stream,
             params,
             cancel_closure,
+            _guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
         };
 
         Ok(connection)
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 396db8f96a..da65065179 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -17,7 +17,10 @@ use anyhow::{bail, Context};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
-use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
+use metrics::{
+    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
+    IntCounterPairVec, IntCounterVec,
+};
 use once_cell::sync::{Lazy, OnceCell};
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use prometheus::{
@@ -44,17 +47,10 @@ const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";
 
-pub static NUM_DB_CONNECTIONS_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
+pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
         "proxy_opened_db_connections_total",
         "Number of opened connections to a database.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
         "proxy_closed_db_connections_total",
         "Number of closed connections to a database.",
         &["protocol"],
@@ -62,17 +58,10 @@ pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(||
     .unwrap()
 });
 
-pub static NUM_CLIENT_CONNECTION_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
+pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
         "proxy_opened_client_connections_total",
         "Number of opened connections from a client.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
         "proxy_closed_client_connections_total",
         "Number of closed connections from a client.",
         &["protocol"],
@@ -80,17 +69,10 @@ pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new
     .unwrap()
 });
 
-pub static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
+pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
         "proxy_accepted_connections_total",
         "Number of client connections accepted.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
         "proxy_closed_connections_total",
         "Number of client connections closed.",
         &["protocol"],
@@ -428,16 +410,12 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     );
 
     let proto = mode.protocol_label();
-    NUM_CLIENT_CONNECTION_OPENED_COUNTER
+    let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
         .with_label_values(&[proto])
-        .inc();
-    NUM_CONNECTIONS_ACCEPTED_COUNTER
+        .guard();
+    let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
         .with_label_values(&[proto])
-        .inc();
-    scopeguard::defer! {
-        NUM_CLIENT_CONNECTION_CLOSED_COUNTER.with_label_values(&[proto]).inc();
-        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
-    }
+        .guard();
 
     let tls = config.tls_config.as_ref();
 
@@ -584,12 +562,13 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
 async fn connect_to_compute_once(
     node_info: &console::CachedNodeInfo,
     timeout: time::Duration,
+    proto: &'static str,
 ) -> Result<PostgresConnection, compute::ConnectionError> {
     let allow_self_signed_compute = node_info.allow_self_signed_compute;
 
     node_info
         .config
-        .connect(allow_self_signed_compute, timeout)
+        .connect(allow_self_signed_compute, timeout, proto)
         .await
 }
 
@@ -610,6 +589,7 @@ pub trait ConnectMechanism {
 pub struct TcpMechanism<'a> {
     /// KV-dictionary with PostgreSQL connection params.
     pub params: &'a StartupMessageParams,
+    pub proto: &'static str,
 }
 
 #[async_trait]
@@ -623,7 +603,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
         node_info: &console::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<PostgresConnection, Self::Error> {
-        connect_to_compute_once(node_info, timeout).await
+        connect_to_compute_once(node_info, timeout, self.proto).await
     }
 
     fn update_connect_config(&self, config: &mut compute::ConnCfg) {
@@ -1028,7 +1008,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
 
         let aux = node_info.aux.clone();
         let mut node = connect_to_compute(
-            &TcpMechanism { params },
+            &TcpMechanism { params, proto },
             node_info,
             &extra,
             &creds,
@@ -1037,13 +1017,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
         .or_else(|e| stream.throw_error(e))
         .await?;
 
-        NUM_DB_CONNECTIONS_OPENED_COUNTER
-            .with_label_values(&[proto])
-            .inc();
-        scopeguard::defer! {
-            NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
-        }
-
         prepare_client_connection(&node, session, &mut stream).await?;
         // Before proxy passing, forward to compute whatever data is left in the
         // PqStream input buffer. Normally there is none, but our serverless npm
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index cdff42b529..870e9c1103 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -8,12 +8,13 @@ mod websocket;
 
 use anyhow::bail;
 use hyper::StatusCode;
+use metrics::IntCounterPairGuard;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
 
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
-use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
+use crate::proxy::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
@@ -149,22 +150,17 @@ pub async fn task_main(
 
 struct MetricService<S> {
     inner: S,
+    _gauge: IntCounterPairGuard,
 }
 
 impl<S> MetricService<S> {
     fn new(inner: S) -> MetricService<S> {
-        NUM_CLIENT_CONNECTION_OPENED_COUNTER
-            .with_label_values(&["http"])
-            .inc();
-        MetricService { inner }
-    }
-}
-
-impl<S> Drop for MetricService<S> {
-    fn drop(&mut self) {
-        NUM_CLIENT_CONNECTION_CLOSED_COUNTER
-            .with_label_values(&["http"])
-            .inc();
+        MetricService {
+            inner,
+            _gauge: NUM_CLIENT_CONNECTION_GAUGE
+                .with_label_values(&["http"])
+                .guard(),
+        }
     }
 }
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 4f3b31b9be..69198d79d3 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -24,10 +24,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
 use crate::{
     auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
     console,
-    proxy::{
-        neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
-        NUM_DB_CONNECTIONS_OPENED_COUNTER,
-    },
+    proxy::{neon_options, LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
     usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};
@@ -477,6 +474,11 @@ async fn connect_to_compute_once(
         .connect_timeout(timeout)
         .connect(tokio_postgres::NoTls)
         .await?;
+
+    let conn_gauge = NUM_DB_CONNECTIONS_GAUGE
+        .with_label_values(&["http"])
+        .guard();
+
     tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
 
     let (tx, mut rx) = tokio::sync::watch::channel(session);
@@ -492,10 +494,7 @@ async fn connect_to_compute_once(
 
     tokio::spawn(
         async move {
-            NUM_DB_CONNECTIONS_OPENED_COUNTER.with_label_values(&["http"]).inc();
-            scopeguard::defer! {
-                NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
-            }
+            let _conn_gauge = conn_gauge;
             poll_fn(move |cx| {
                 if matches!(rx.has_changed(), Ok(true)) {
                     session = *rx.borrow_and_update();
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 6e80260193..795ba819c1 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -29,7 +29,7 @@ use utils::http::error::ApiError;
 use utils::http::json::json_response;
 
 use crate::config::HttpConfig;
-use crate::proxy::{NUM_CONNECTIONS_ACCEPTED_COUNTER, NUM_CONNECTIONS_CLOSED_COUNTER};
+use crate::proxy::NUM_CONNECTION_REQUESTS_GAUGE;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
@@ -303,12 +303,9 @@ async fn handle_inner(
     session_id: uuid::Uuid,
     peer_addr: IpAddr,
 ) -> anyhow::Result<Response<Body>> {
-    NUM_CONNECTIONS_ACCEPTED_COUNTER
+    let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
         .with_label_values(&["http"])
-        .inc();
-    scopeguard::defer! {
-        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
-    }
+        .guard();
 
     //
     // Determine the destination and connection params
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index d5333abae6..761541168c 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -11,7 +11,7 @@ use tracing::{debug, info, info_span, Instrument};
 use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
 
-use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
+use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE};
 use crate::safekeeper::Term;
 use crate::timeline::TimelineError;
 use crate::wal_service::ConnectionId;
@@ -210,10 +210,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
         let cmd = parse_cmd(query_string)?;
         let cmd_str = cmd_to_string(&cmd);
 
-        PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc();
-        scopeguard::defer! {
-            PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc();
-        }
+        let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard();
 
         info!("got query {:?}", query_string);
 
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 0711beb290..11a3f48922 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -11,7 +11,8 @@ use futures::Future;
 use metrics::{
     core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
     proto::MetricFamily,
-    register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec,
+    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge,
+    IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec,
 };
 use once_cell::sync::Lazy;
 
@@ -89,16 +90,10 @@ pub static BROKER_PULLED_UPDATES: Lazy<IntCounterVec> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_broker_pulled_updates_total counter")
 });
-pub static PG_QUERIES_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
+pub static PG_QUERIES_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
         "safekeeper_pg_queries_received_total",
         "Number of queries received through pg protocol",
-        &["query"]
-    )
-    .expect("Failed to register safekeeper_pg_queries_received_total counter")
-});
-pub static PG_QUERIES_FINISHED: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
         "safekeeper_pg_queries_finished_total",
         "Number of queries finished through pg protocol",
         &["query"]

From f0104791070fa9e4b110bb95dbca0866cd593d36 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 14 Dec 2023 21:32:54 +0200
Subject: [PATCH 098/202] feat(layer): pageserver_layer_redownloaded_after
 histogram (#6132)

This is aimed at later replacing the current mtime-only based thrashing
alerting.

Cc: #5331
---
 pageserver/src/tenant/storage_layer/layer.rs | 39 ++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 112128ead8..69a2893456 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -457,6 +457,8 @@ struct LayerInner {
     /// For loaded layers, this may be some other value if the tenant has undergone
     /// a shard split since the layer was originally written.
     shard: ShardIndex,
+
+    last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
 }
 
 impl std::fmt::Display for LayerInner {
@@ -587,6 +589,7 @@ impl LayerInner {
             consecutive_failures: AtomicUsize::new(0),
             generation,
             shard,
+            last_evicted_at: std::sync::Mutex::default(),
         }
     }
 
@@ -722,6 +725,14 @@ impl LayerInner {
                     permit
                 };
 
+                let since_last_eviction =
+                    self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
+                if let Some(since_last_eviction) = since_last_eviction {
+                    // FIXME: this will not always be recorded correctly until #6028 (the no
+                    // download needed branch above)
+                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                }
+
                 let res = Arc::new(DownloadedLayer {
                     owner: Arc::downgrade(self),
                     kind: tokio::sync::OnceCell::default(),
@@ -1117,6 +1128,8 @@ impl LayerInner {
         // we are still holding the permit, so no new spawn_download_and_wait can happen
         drop(self.status.send(Status::Evicted));
 
+        *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
+
         res
     }
 
@@ -1421,6 +1434,7 @@ pub(crate) struct LayerImplMetrics {
 
     rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
     inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
+    redownload_after: metrics::Histogram,
 }
 
 impl Default for LayerImplMetrics {
@@ -1496,6 +1510,26 @@ impl Default for LayerImplMetrics {
         )
         .unwrap();
 
+        let redownload_after = {
+            let minute = 60.0;
+            let hour = 60.0 * minute;
+            metrics::register_histogram!(
+                "pageserver_layer_redownloaded_after",
+                "Time between evicting and re-downloading.",
+                vec![
+                    10.0,
+                    30.0,
+                    minute,
+                    5.0 * minute,
+                    15.0 * minute,
+                    30.0 * minute,
+                    hour,
+                    12.0 * hour,
+                ]
+            )
+            .unwrap()
+        };
+
         Self {
             started_evictions,
             completed_evictions,
@@ -1507,6 +1541,7 @@ impl Default for LayerImplMetrics {
 
             rare_counters,
             inits_cancelled,
+            redownload_after,
         }
     }
 }
@@ -1574,6 +1609,10 @@ impl LayerImplMetrics {
     fn inc_init_cancelled(&self) {
         self.inits_cancelled.inc()
     }
+
+    fn record_redownloaded_after(&self, duration: std::time::Duration) {
+        self.redownload_after.observe(duration.as_secs_f64())
+    }
 }
 
 #[derive(enum_map::Enum)]

From f1cd1a212224742c0f97e3b1bc7f66422725f67c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Dec 2023 08:51:23 +0000
Subject: [PATCH 099/202] pageserver: improved handling of concurrent timeline
 creations on the same ID (#6139)

## Problem

Historically, the pageserver used an "uninit mark" file on disk for two
purposes:
- Track which timeline dirs are incomplete for handling on restart
- Avoid trying to create the same timeline twice at the same time.

The original purpose of handling restarts is now defunct, as we use
remote storage as the source of truth and clean up any trash timeline
dirs on startup. Using the file to mutually exclude creation operations
is error prone compared with just doing it in memory, and the existing
checks happened some way into the creation operation, and could expose
errors as 500s (anyhow::Errors) rather than something clean.

## Summary of changes

- Creations are now mutually excluded in memory (using
`Tenant::timelines_creating`), rather than relying on a file on disk for
coordination.
- Acquiring unique access to the timeline ID now happens earlier in the
request.
- Creating the same timeline which already exists is now a 201: this
simplifies retry handling for clients.
- 409 is still returned if a timeline with the same ID is still being
created: if this happens it is probably because the client timed out an
earlier request and has retried.
- Colliding timeline creation requests should no longer return 500
errors

This paves the way to entirely removing uninit markers in a subsequent
change.

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/src/http/openapi_spec.yml     |   4 +-
 pageserver/src/http/routes.rs            |   2 +-
 pageserver/src/tenant.rs                 | 221 +++++++++++++++--------
 pageserver/src/tenant/timeline.rs        |   6 +
 pageserver/src/tenant/timeline/uninit.rs |  63 ++++++-
 pageserver/src/walingest.rs              |   2 +-
 test_runner/regress/test_branching.py    |  89 +--------
 7 files changed, 207 insertions(+), 180 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index fbb059c17b..b79c5ada9a 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -992,8 +992,8 @@ paths:
           type: string
     post:
       description: |
-        Create a timeline. Returns new timeline id on success.\
-        If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
+        Create a timeline. Returns new timeline id on success.
+        Recreating the same timeline will succeed if the parameters match the existing timeline.
         If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
       requestBody:
         content:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 618d01dd6a..14107557d9 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -453,7 +453,7 @@ async fn timeline_create_handler(
                     .map_err(ApiError::InternalServerError)?;
                 json_response(StatusCode::CREATED, timeline_info)
             }
-            Err(tenant::CreateTimelineError::AlreadyExists) => {
+            Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
                 json_response(StatusCode::CONFLICT, ())
             }
             Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 89d332f178..969210622c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -48,6 +48,7 @@ use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
+use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
@@ -87,7 +88,6 @@ use std::process::Stdio;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::MutexGuard;
 use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};
 
@@ -249,6 +249,12 @@ pub struct Tenant {
     generation: Generation,
 
     timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
+
+    /// During timeline creation, we first insert the TimelineId to the
+    /// creating map, then `timelines`, then remove it from the creating map.
+    /// **Lock order**: if acquring both, acquire`timelines` before `timelines_creating`
+    timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,
+
     // This mutex prevents creation of new timelines during GC.
     // Adding yet another mutex (in addition to `timelines`) is needed because holding
     // `timelines` mutex during all GC iteration
@@ -407,8 +413,10 @@ impl Debug for SetStoppingError {
 
 #[derive(thiserror::Error, Debug)]
 pub enum CreateTimelineError {
-    #[error("a timeline with the given ID already exists")]
-    AlreadyExists,
+    #[error("creation of timeline with the given ID is in progress")]
+    AlreadyCreating,
+    #[error("timeline already exists with different parameters")]
+    Conflict,
     #[error(transparent)]
     AncestorLsn(anyhow::Error),
     #[error("ancestor timeline is not active")]
@@ -1458,7 +1466,7 @@ impl Tenant {
     /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
     /// minimum amount of keys required to get a writable timeline.
     /// (Without it, `put` might fail due to `repartition` failing.)
-    pub async fn create_empty_timeline(
+    pub(crate) async fn create_empty_timeline(
         &self,
         new_timeline_id: TimelineId,
         initdb_lsn: Lsn,
@@ -1470,10 +1478,7 @@ impl Tenant {
             "Cannot create empty timelines on inactive tenant"
         );
 
-        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock().unwrap();
-            self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
-        };
+        let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
         let new_metadata = TimelineMetadata::new(
             // Initialize disk_consistent LSN to 0, The caller must import some data to
             // make it valid, before calling finish_creation()
@@ -1550,7 +1555,7 @@ impl Tenant {
     /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
     /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
     #[allow(clippy::too_many_arguments)]
-    pub async fn create_timeline(
+    pub(crate) async fn create_timeline(
         &self,
         new_timeline_id: TimelineId,
         ancestor_timeline_id: Option<TimelineId>,
@@ -1571,26 +1576,51 @@ impl Tenant {
             .enter()
             .map_err(|_| CreateTimelineError::ShuttingDown)?;
 
-        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
-            debug!("timeline {new_timeline_id} already exists");
-
-            if let Some(remote_client) = existing.remote_client.as_ref() {
-                // Wait for uploads to complete, so that when we return Ok, the timeline
-                // is known to be durable on remote storage. Just like we do at the end of
-                // this function, after we have created the timeline ourselves.
-                //
-                // We only really care that the initial version of `index_part.json` has
-                // been uploaded. That's enough to remember that the timeline
-                // exists. However, there is no function to wait specifically for that so
-                // we just wait for all in-progress uploads to finish.
-                remote_client
-                    .wait_completion()
-                    .await
-                    .context("wait for timeline uploads to complete")?;
+        // Get exclusive access to the timeline ID: this ensures that it does not already exist,
+        // and that no other creation attempts will be allowed in while we are working.  The
+        // uninit_mark is a guard.
+        let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
+            Ok(m) => m,
+            Err(TimelineExclusionError::AlreadyCreating) => {
+                // Creation is in progress, we cannot create it again, and we cannot
+                // check if this request matches the existing one, so caller must try
+                // again later.
+                return Err(CreateTimelineError::AlreadyCreating);
             }
+            Err(TimelineExclusionError::Other(e)) => {
+                return Err(CreateTimelineError::Other(e));
+            }
+            Err(TimelineExclusionError::AlreadyExists(existing)) => {
+                debug!("timeline {new_timeline_id} already exists");
 
-            return Err(CreateTimelineError::AlreadyExists);
-        }
+                // Idempotency: creating the same timeline twice is not an error, unless
+                // the second creation has different parameters.
+                if existing.get_ancestor_timeline_id() != ancestor_timeline_id
+                    || existing.pg_version != pg_version
+                    || (ancestor_start_lsn.is_some()
+                        && ancestor_start_lsn != Some(existing.get_ancestor_lsn()))
+                {
+                    return Err(CreateTimelineError::Conflict);
+                }
+
+                if let Some(remote_client) = existing.remote_client.as_ref() {
+                    // Wait for uploads to complete, so that when we return Ok, the timeline
+                    // is known to be durable on remote storage. Just like we do at the end of
+                    // this function, after we have created the timeline ourselves.
+                    //
+                    // We only really care that the initial version of `index_part.json` has
+                    // been uploaded. That's enough to remember that the timeline
+                    // exists. However, there is no function to wait specifically for that so
+                    // we just wait for all in-progress uploads to finish.
+                    remote_client
+                        .wait_completion()
+                        .await
+                        .context("wait for timeline uploads to complete")?;
+                }
+
+                return Ok(existing);
+            }
+        };
 
         let loaded_timeline = match ancestor_timeline_id {
             Some(ancestor_timeline_id) => {
@@ -1627,18 +1657,32 @@ impl Tenant {
                     ancestor_timeline.wait_lsn(*lsn, ctx).await?;
                 }
 
-                self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
-                    .await?
+                self.branch_timeline(
+                    &ancestor_timeline,
+                    new_timeline_id,
+                    ancestor_start_lsn,
+                    uninit_mark,
+                    ctx,
+                )
+                .await?
             }
             None => {
-                self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx)
-                    .await?
+                self.bootstrap_timeline(
+                    new_timeline_id,
+                    pg_version,
+                    load_existing_initdb,
+                    uninit_mark,
+                    ctx,
+                )
+                .await?
             }
         };
 
+        // At this point we have dropped our guard on [`Self::timelines_creating`], and
+        // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet.  We must
+        // not send a success to the caller until it is.  The same applies to handling retries,
+        // see the handling of [`TimelineExclusionError::AlreadyExists`] above.
         if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
-            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
-            // Ok, the timeline is durable in remote storage.
             let kind = ancestor_timeline_id
                 .map(|_| "branched")
                 .unwrap_or("bootstrapped");
@@ -2422,6 +2466,7 @@ impl Tenant {
             loading_started_at: Instant::now(),
             tenant_conf: Arc::new(RwLock::new(attached_conf)),
             timelines: Mutex::new(HashMap::new()),
+            timelines_creating: Mutex::new(HashSet::new()),
             gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
             remote_storage,
@@ -2813,8 +2858,9 @@ impl Tenant {
         start_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
+        let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
         let tl = self
-            .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
+            .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
             .await?;
         tl.set_state(TimelineState::Active);
         Ok(tl)
@@ -2828,9 +2874,10 @@ impl Tenant {
         src_timeline: &Arc<Timeline>,
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
+        timeline_uninit_mark: TimelineUninitMark<'_>,
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
+        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
             .await
     }
 
@@ -2839,13 +2886,14 @@ impl Tenant {
         src_timeline: &Arc<Timeline>,
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
+        timeline_uninit_mark: TimelineUninitMark<'_>,
         _ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
         let src_id = src_timeline.timeline_id;
 
-        // First acquire the GC lock so that another task cannot advance the GC
-        // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
-        // creating the branch.
+        // We will validate our ancestor LSN in this function.  Acquire the GC lock so that
+        // this check cannot race with GC, and the ancestor LSN is guaranteed to remain
+        // valid while we are creating the branch.
         let _gc_cs = self.gc_cs.lock().await;
 
         // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
@@ -2855,13 +2903,6 @@ impl Tenant {
             lsn
         });
 
-        // Create a placeholder for the new branch. This will error
-        // out if the new timeline ID is already in use.
-        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock().unwrap();
-            self.create_timeline_uninit_mark(dst_id, &timelines)?
-        };
-
         // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
         // horizon on the source timeline
         //
@@ -2953,21 +2994,38 @@ impl Tenant {
         Ok(new_timeline)
     }
 
-    /// - run initdb to init temporary instance and get bootstrap data
-    /// - after initialization completes, tar up the temp dir and upload it to S3.
-    ///
-    /// The caller is responsible for activating the returned timeline.
-    pub(crate) async fn bootstrap_timeline(
+    /// For unit tests, make this visible so that other modules can directly create timelines
+    #[cfg(test)]
+    pub(crate) async fn bootstrap_timeline_test(
         &self,
         timeline_id: TimelineId,
         pg_version: u32,
         load_existing_initdb: Option<TimelineId>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Timeline>> {
-        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock().unwrap();
-            self.create_timeline_uninit_mark(timeline_id, &timelines)?
-        };
+        let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
+        self.bootstrap_timeline(
+            timeline_id,
+            pg_version,
+            load_existing_initdb,
+            uninit_mark,
+            ctx,
+        )
+        .await
+    }
+
+    /// - run initdb to init temporary instance and get bootstrap data
+    /// - after initialization completes, tar up the temp dir and upload it to S3.
+    ///
+    /// The caller is responsible for activating the returned timeline.
+    async fn bootstrap_timeline(
+        &self,
+        timeline_id: TimelineId,
+        pg_version: u32,
+        load_existing_initdb: Option<TimelineId>,
+        timeline_uninit_mark: TimelineUninitMark<'_>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<Timeline>> {
         // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
         // temporary directory for basebackup files for the given timeline.
 
@@ -3164,11 +3222,11 @@ impl Tenant {
     /// at 'disk_consistent_lsn'. After any initial data has been imported, call
     /// `finish_creation` to insert the Timeline into the timelines map and to remove the
     /// uninit mark file.
-    async fn prepare_new_timeline(
-        &self,
+    async fn prepare_new_timeline<'a>(
+        &'a self,
         new_timeline_id: TimelineId,
         new_metadata: &TimelineMetadata,
-        uninit_mark: TimelineUninitMark,
+        uninit_mark: TimelineUninitMark<'a>,
         start_lsn: Lsn,
         ancestor: Option<Arc<Timeline>>,
     ) -> anyhow::Result<UninitializedTimeline> {
@@ -3241,23 +3299,38 @@ impl Tenant {
     fn create_timeline_uninit_mark(
         &self,
         timeline_id: TimelineId,
-        timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
-    ) -> anyhow::Result<TimelineUninitMark> {
+    ) -> Result<TimelineUninitMark, TimelineExclusionError> {
         let tenant_shard_id = self.tenant_shard_id;
 
-        anyhow::ensure!(
-            timelines.get(&timeline_id).is_none(),
-            "Timeline {tenant_shard_id}/{timeline_id} already exists in pageserver's memory"
-        );
-        let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
-        anyhow::ensure!(
-            !timeline_path.exists(),
-            "Timeline {timeline_path} already exists, cannot create its uninit mark file",
-        );
-
         let uninit_mark_path = self
             .conf
             .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
+        let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
+
+        let uninit_mark = TimelineUninitMark::new(
+            self,
+            timeline_id,
+            uninit_mark_path.clone(),
+            timeline_path.clone(),
+        )?;
+
+        // At this stage, we have got exclusive access to in-memory state for this timeline ID
+        // for creation.
+        // A timeline directory should never exist on disk already:
+        // - a previous failed creation would have cleaned up after itself
+        // - a pageserver restart would clean up timeline directories that don't have valid remote state
+        //
+        // Therefore it is an unexpected internal error to encounter a timeline directory already existing here,
+        // this error may indicate a bug in cleanup on failed creations.
+        if timeline_path.exists() {
+            return Err(TimelineExclusionError::Other(anyhow::anyhow!(
+                "Timeline directory already exists! This is a bug."
+            )));
+        }
+
+        // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
+        // that during process runtime, colliding creations will be caught in-memory without getting
+        // as far as failing to write a file.
         fs::OpenOptions::new()
             .write(true)
             .create_new(true)
@@ -3271,8 +3344,6 @@ impl Tenant {
                 format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
             })?;
 
-        let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path);
-
         Ok(uninit_mark)
     }
 
@@ -4022,13 +4093,7 @@ mod tests {
             .await
         {
             Ok(_) => panic!("duplicate timeline creation should fail"),
-            Err(e) => assert_eq!(
-                e.to_string(),
-                format!(
-                    "Timeline {}/{} already exists in pageserver's memory",
-                    tenant.tenant_shard_id, TIMELINE_ID
-                )
-            ),
+            Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()),
         }
 
         Ok(())
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 466c92d32b..ac1922ccad 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -446,6 +446,12 @@ pub(crate) enum CompactFlags {
     ForceRepartition,
 }
 
+impl std::fmt::Debug for Timeline {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "Timeline<{}>", self.timeline_id)
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs
index 61130f541a..27d6fd9c28 100644
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -19,14 +19,14 @@ use super::Timeline;
 pub struct UninitializedTimeline<'t> {
     pub(crate) owning_tenant: &'t Tenant,
     timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
 }
 
 impl<'t> UninitializedTimeline<'t> {
     pub(crate) fn new(
         owning_tenant: &'t Tenant,
         timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
+        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
     ) -> Self {
         Self {
             owning_tenant,
@@ -169,18 +169,55 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
 ///
 /// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
 #[must_use]
-pub(crate) struct TimelineUninitMark {
+pub(crate) struct TimelineUninitMark<'t> {
+    owning_tenant: &'t Tenant,
+    timeline_id: TimelineId,
     uninit_mark_deleted: bool,
     uninit_mark_path: Utf8PathBuf,
     pub(crate) timeline_path: Utf8PathBuf,
 }
 
-impl TimelineUninitMark {
-    pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self {
-        Self {
-            uninit_mark_deleted: false,
-            uninit_mark_path,
-            timeline_path,
+/// Errors when acquiring exclusive access to a timeline ID for creation
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum TimelineExclusionError {
+    #[error("Already exists")]
+    AlreadyExists(Arc<Timeline>),
+    #[error("Already creating")]
+    AlreadyCreating,
+
+    // e.g. I/O errors, or some failure deep in postgres initdb
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl<'t> TimelineUninitMark<'t> {
+    pub(crate) fn new(
+        owning_tenant: &'t Tenant,
+        timeline_id: TimelineId,
+        uninit_mark_path: Utf8PathBuf,
+        timeline_path: Utf8PathBuf,
+    ) -> Result<Self, TimelineExclusionError> {
+        // Lock order: this is the only place we take both locks.  During drop() we only
+        // lock creating_timelines
+        let timelines = owning_tenant.timelines.lock().unwrap();
+        let mut creating_timelines: std::sync::MutexGuard<
+            '_,
+            std::collections::HashSet<TimelineId>,
+        > = owning_tenant.timelines_creating.lock().unwrap();
+
+        if let Some(existing) = timelines.get(&timeline_id) {
+            Err(TimelineExclusionError::AlreadyExists(existing.clone()))
+        } else if creating_timelines.contains(&timeline_id) {
+            Err(TimelineExclusionError::AlreadyCreating)
+        } else {
+            creating_timelines.insert(timeline_id);
+            Ok(Self {
+                owning_tenant,
+                timeline_id,
+                uninit_mark_deleted: false,
+                uninit_mark_path,
+                timeline_path,
+            })
         }
     }
 
@@ -207,7 +244,7 @@ impl TimelineUninitMark {
     }
 }
 
-impl Drop for TimelineUninitMark {
+impl Drop for TimelineUninitMark<'_> {
     fn drop(&mut self) {
         if !self.uninit_mark_deleted {
             if self.timeline_path.exists() {
@@ -226,5 +263,11 @@ impl Drop for TimelineUninitMark {
                 }
             }
         }
+
+        self.owning_tenant
+            .timelines_creating
+            .lock()
+            .unwrap()
+            .remove(&self.timeline_id);
     }
 }
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 738216afa5..16b245c488 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -2191,7 +2191,7 @@ mod tests {
             .load()
             .await;
         let tline = tenant
-            .bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
+            .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
             .await
             .unwrap();
 
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 82ca985d01..9a0b91b54e 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -1,8 +1,7 @@
 import random
 import threading
 import time
-from queue import SimpleQueue
-from typing import Any, Dict, List, Union
+from typing import List
 
 import pytest
 from fixtures.log_helper import log
@@ -239,92 +238,6 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
         t.join()
 
 
-def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: NeonEnvBuilder):
-    """
-    If the activate only after upload is used, then retries could become competing.
-    """
-
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    env.pageserver.allowed_errors.extend(
-        [
-            ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
-            ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory",
-        ]
-    )
-    ps_http = env.pageserver.http_client()
-
-    # pause all uploads
-    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    env.pageserver.tenant_create(env.initial_tenant)
-
-    def start_creating_timeline():
-        ps_http.timeline_create(
-            env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
-        )
-
-    create_root = threading.Thread(target=start_creating_timeline)
-
-    branch_id = TimelineId.generate()
-
-    queue: SimpleQueue[Union[Dict[Any, Any], Exception]] = SimpleQueue()
-    barrier = threading.Barrier(3)
-
-    def try_branch():
-        barrier.wait()
-        barrier.wait()
-        try:
-            ret = ps_http.timeline_create(
-                env.pg_version,
-                env.initial_tenant,
-                branch_id,
-                ancestor_timeline_id=env.initial_timeline,
-                timeout=5,
-            )
-            queue.put(ret)
-        except Exception as e:
-            queue.put(e)
-
-    threads = [threading.Thread(target=try_branch) for _ in range(2)]
-
-    try:
-        create_root.start()
-
-        for t in threads:
-            t.start()
-
-        wait_until_paused(env, "before-upload-index-pausable")
-
-        barrier.wait()
-        ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
-        barrier.wait()
-
-        # now both requests race to branch, only one can win because they take gc_cs, Tenant::timelines or marker files
-        first = queue.get()
-        second = queue.get()
-
-        log.info(first)
-        log.info(second)
-
-        (succeeded, failed) = (first, second) if isinstance(second, Exception) else (second, first)
-        assert isinstance(failed, Exception)
-        assert isinstance(succeeded, Dict)
-
-        # there's multiple valid status codes:
-        # - Timeline x/y already exists
-        # - whatever 409 response says, but that is a subclass of PageserverApiException
-        assert isinstance(failed, PageserverApiException)
-        assert succeeded["state"] == "Active"
-    finally:
-        # we might still have the failpoint active
-        env.pageserver.stop(immediate=True)
-
-        for t in threads:
-            t.join()
-        create_root.join()
-
-
 def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
     """
     Check that a timeline is deleted locally on subsequent restart if it never successfully uploaded during creation.

From 5bb9ba37cc83460c60cf26edeff027f48f17d7d3 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 14 Dec 2023 17:19:36 +0300
Subject: [PATCH 100/202] Fix python list_segments of sk.

Fixes rare test_peer_recovery flakiness as we started to compare tmp control
file.

https://neondb.slack.com/archives/C04KGFVUWUQ/p1702310929657179
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 7dfdd9274d..42e122cefe 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2945,7 +2945,7 @@ class Safekeeper:
         tli_dir = self.timeline_dir(tenant_id, timeline_id)
         segments = []
         for _, _, filenames in os.walk(tli_dir):
-            segments.extend([f for f in filenames if f != "safekeeper.control"])
+            segments.extend([f for f in filenames if not f.startswith("safekeeper.control")])
         segments.sort()
         return segments
 

From 07508fb1108d7fb503471a433395838fd13567ed Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 15 Dec 2023 12:18:22 +0200
Subject: [PATCH 101/202] fix: better Json parsing errors (#6135)

Previously, any JSON parsing failure in the HTTP API only returned
per-field errors. Now parsing is done using `serde_path_to_error`, which
at least helped greatly with the `disk_usage_eviction_run` used for
testing. I don't think this can conflict with anything added in #5310.
---
 Cargo.lock                    | 1 +
 libs/utils/Cargo.toml         | 2 ++
 libs/utils/src/http/json.rs   | 8 ++++++--
 pageserver/src/http/routes.rs | 4 +---
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 71c2c976a7..7bf9209395 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5765,6 +5765,7 @@ dependencies = [
  "serde",
  "serde_assert",
  "serde_json",
+ "serde_path_to_error",
  "serde_with",
  "signal-hook",
  "strum",
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index ccf6f4f2d7..af0414daa2 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -50,6 +50,8 @@ const_format.workspace = true
 # why is it only here? no other crate should use it, streams are rarely needed.
 tokio-stream = { version = "0.1.14" }
 
+serde_path_to_error.workspace = true
+
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs
index 70e682cb76..7ca62561fe 100644
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -25,8 +25,12 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
     if body.remaining() == 0 {
         return Ok(None);
     }
-    serde_json::from_reader(body.reader())
-        .context("Failed to parse json request")
+
+    let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
+
+    serde_path_to_error::deserialize(&mut deser)
+        // intentionally stringify because the debug version is not helpful in python logs
+        .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
         .map(Some)
         .map_err(ApiError::BadRequest)
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 14107557d9..102c9d9a6a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1621,9 +1621,7 @@ async fn disk_usage_eviction_run(
         }
     }
 
-    let config = json_request::<Config>(&mut r)
-        .await
-        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+    let config = json_request::<Config>(&mut r).await?;
 
     let usage = Usage {
         config,

From 0fd80484a926ff27275af14c5199d7364b668e56 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 15 Dec 2023 13:59:24 +0200
Subject: [PATCH 102/202] fix: Timeline deletion during busy startup (#6133)

Compaction was holding back timeline deletion: it had already acquired
the compaction lock and was then waiting on the background tasks
semaphore while still holding that lock. Timeline deletion was waiting
on the same lock for 1500s.

This replaces the
`pageserver::tenant::tasks::concurrent_background_tasks_rate_limit`
(which looks correct) with a simpler `..._permit`, which is just an
infallible acquire. That makes it easier to spot "aah this needs to be
raced with cancellation tokens".

Ref: https://neondb.slack.com/archives/C03F5SM1N02/p1702496912904719
Ref: https://neondb.slack.com/archives/C03F5SM1N02/p1702578093497779
---
 pageserver/src/task_mgr.rs                    | 11 +++-
 pageserver/src/tenant/tasks.rs                | 23 ++------
 pageserver/src/tenant/timeline.rs             | 57 ++++++++++++-------
 .../src/tenant/timeline/eviction_task.rs      | 16 +++---
 4 files changed, 58 insertions(+), 49 deletions(-)

diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index b80a498c82..cb1b2b8011 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -561,9 +561,14 @@ pub async fn shutdown_watcher() {
 /// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
 /// `tokio::task::JoinSet::spawn`.
 pub fn shutdown_token() -> CancellationToken {
-    SHUTDOWN_TOKEN
-        .try_with(|t| t.clone())
-        .expect("shutdown_token() called in an unexpected task or thread")
+    let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
+
+    if cfg!(test) {
+        // in tests this method is called from non-taskmgr spawned tasks, and that is all ok.
+        res.unwrap_or_default()
+    } else {
+        res.expect("shutdown_token() called in an unexpected task or thread")
+    }
 }
 
 /// Has the current task been requested to shut down?
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 4b118442f4..7ff1873eda 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -54,29 +54,18 @@ impl BackgroundLoopKind {
     }
 }
 
-pub(crate) enum RateLimitError {
-    Cancelled,
-}
-
-pub(crate) async fn concurrent_background_tasks_rate_limit(
+/// Cancellation safe.
+pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
     loop_kind: BackgroundLoopKind,
     _ctx: &RequestContext,
-    cancel: &CancellationToken,
-) -> Result<impl Drop, RateLimitError> {
+) -> impl Drop {
     let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
         .with_label_values(&[loop_kind.as_static_str()])
         .guard();
 
-    tokio::select! {
-        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
-            match permit {
-                Ok(permit) => Ok(permit),
-                Err(_closed) => unreachable!("we never close the semaphore"),
-            }
-        },
-        _ = cancel.cancelled() => {
-            Err(RateLimitError::Cancelled)
-        }
+    match CONCURRENT_BACKGROUND_TASKS.acquire().await {
+        Ok(permit) => permit,
+        Err(_closed) => unreachable!("we never close the semaphore"),
     }
 }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index ac1922ccad..7438215a68 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -51,7 +51,7 @@ use crate::tenant::storage_layer::{
     LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
     ValueReconstructState,
 };
-use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
+use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
     layer_map::{LayerMap, SearchResult},
@@ -715,19 +715,27 @@ impl Timeline {
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
     ) -> Result<(), CompactionError> {
-        let _g = self.compaction_lock.lock().await;
+        // most likely the cancellation token is from background task, but in tests it could be the
+        // request task as well.
+
+        let prepare = async move {
+            let guard = self.compaction_lock.lock().await;
+
+            let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
+                BackgroundLoopKind::Compaction,
+                ctx,
+            )
+            .await;
+
+            (guard, permit)
+        };
 
         // this wait probably never needs any "long time spent" logging, because we already nag if
         // compaction task goes over it's period (20s) which is quite often in production.
-        let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
-            BackgroundLoopKind::Compaction,
-            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return Ok(()),
+        let (_guard, _permit) = tokio::select! {
+            tuple = prepare => { tuple },
+            _ = self.cancel.cancelled() => return Ok(()),
+            _ = cancel.cancelled() => return Ok(()),
         };
 
         let last_record_lsn = self.get_last_record_lsn();
@@ -1782,22 +1790,22 @@ impl Timeline {
             let skip_concurrency_limiter = &skip_concurrency_limiter;
             async move {
                 let cancel = task_mgr::shutdown_token();
-                let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
+                let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
                     BackgroundLoopKind::InitialLogicalSizeCalculation,
                     background_ctx,
-                    &cancel,
                 );
 
                 use crate::metrics::initial_logical_size::StartCircumstances;
                 let (_maybe_permit, circumstances) = tokio::select! {
-                    res = wait_for_permit => {
-                        match res {
-                            Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
-                            Err(RateLimitError::Cancelled) => {
-                                return Err(BackgroundCalculationError::Cancelled);
-                            }
-                        }
+                    permit = wait_for_permit => {
+                        (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
                     }
+                    _ = self_ref.cancel.cancelled() => {
+                        return Err(BackgroundCalculationError::Cancelled);
+                    }
+                    _ = cancel.cancelled() => {
+                        return Err(BackgroundCalculationError::Cancelled);
+                    },
                     () = skip_concurrency_limiter.cancelled() => {
                         // Some action that is part of a end user interaction requested logical size
                         // => break out of the rate limit
@@ -3852,7 +3860,14 @@ impl Timeline {
     /// within a layer file. We can only remove the whole file if it's fully
     /// obsolete.
     pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
-        let _g = self.gc_lock.lock().await;
+        // this is most likely the background tasks, but it might be the spawned task from
+        // immediate_gc
+        let cancel = crate::task_mgr::shutdown_token();
+        let _g = tokio::select! {
+            guard = self.gc_lock.lock() => guard,
+            _ = self.cancel.cancelled() => return Ok(GcResult::default()),
+            _ = cancel.cancelled() => return Ok(GcResult::default()),
+        };
         let timer = self.metrics.garbage_collect_histo.start_timer();
 
         fail_point!("before-timeline-gc");
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 782e8f9e39..ea5f5f5fa7 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,7 @@ use crate::{
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
         config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        tasks::{BackgroundLoopKind, RateLimitError},
+        tasks::BackgroundLoopKind,
         timeline::EvictionError,
         LogicalSizeCalculationCause, Tenant,
     },
@@ -158,15 +158,15 @@ impl Timeline {
     ) -> ControlFlow<()> {
         let now = SystemTime::now();
 
-        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
+        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
             BackgroundLoopKind::Eviction,
             ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
+        );
+
+        let _permit = tokio::select! {
+            permit = acquire_permit => permit,
+            _ = cancel.cancelled() => return ControlFlow::Break(()),
+            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
         };
 
         // If we evict layers but keep cached values derived from those layers, then

From 215cdd18c4032adf915332252fcd4a685b9ca406 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 15 Dec 2023 13:11:25 +0100
Subject: [PATCH 103/202] Make initdb upload retries cancellable and seek to
 beginning (#6147)

* initdb uploads had no cancellation token, which means that when we
were stuck in upload retries, we wouldn't be able to delete the
timeline. In general, the combination of retrying forever and not having
cancellation tokens is quite dangerous.
* initdb uploads wouldn't rewind the file. This wasn't discovered in the
purposefully unreliable test-s3 in pytest because uploads there always
fail on the first byte, not somewhere during the connection. We'd be
getting errors from the AWS SDK that the file was at an unexpected end.

slack thread: https://neondb.slack.com/archives/C033RQ5SPDH/p1702632247784079
---
 pageserver/src/tenant.rs                               | 5 +++--
 pageserver/src/tenant/remote_timeline_client/upload.rs | 8 ++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 969210622c..ace20e44e2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3106,8 +3106,9 @@ impl Tenant {
                     3,
                     u32::MAX,
                     "persist_initdb_tar_zst",
-                    // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-                    backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+                    backoff::Cancel::new(self.cancel.clone(), || {
+                        anyhow::anyhow!("initdb upload cancelled")
+                    }),
                 )
                 .await?;
 
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index d0744e7c83..9f5b3c242b 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -4,8 +4,9 @@ use anyhow::{bail, Context};
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
-use std::io::ErrorKind;
+use std::io::{ErrorKind, SeekFrom};
 use tokio::fs::{self, File};
+use tokio::io::AsyncSeekExt;
 
 use super::Generation;
 use crate::{
@@ -119,11 +120,14 @@ pub(crate) async fn upload_initdb_dir(
     storage: &GenericRemoteStorage,
     tenant_id: &TenantId,
     timeline_id: &TimelineId,
-    initdb_tar_zst: File,
+    mut initdb_tar_zst: File,
     size: u64,
 ) -> anyhow::Result<()> {
     tracing::trace!("uploading initdb dir");
 
+    // We might have read somewhat into the file already in the prior retry attempt
+    initdb_tar_zst.seek(SeekFrom::Start(0)).await?;
+
     let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
 
     let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);

From 98629841e093eb059e6f75286cba98b4b93bcf6a Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 15 Dec 2023 12:11:50 +0000
Subject: [PATCH 104/202] improve proxy code cov (#6141)

## Summary of changes

saw some low-hanging codecov improvements. even if code coverage is
somewhat of a pointless game, might as well add tests where we can and
delete code if it's unused
---
 proxy/src/rate_limiter/aimd.rs    | 33 --------------
 proxy/src/rate_limiter/limiter.rs | 69 +++++++++++++++++++-----------
 proxy/src/serverless/websocket.rs | 71 ++++++++++++++++++++++++++++---
 3 files changed, 107 insertions(+), 66 deletions(-)

diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs
index c6c532ae53..2c14a54a6c 100644
--- a/proxy/src/rate_limiter/aimd.rs
+++ b/proxy/src/rate_limiter/aimd.rs
@@ -33,39 +33,6 @@ impl Aimd {
             min_utilisation_threshold: config.aimd_min_utilisation_threshold,
         }
     }
-
-    pub fn decrease_factor(self, factor: f32) -> Self {
-        assert!((0.5..1.0).contains(&factor));
-        Self {
-            decrease_factor: factor,
-            ..self
-        }
-    }
-
-    pub fn increase_by(self, increase: usize) -> Self {
-        assert!(increase > 0);
-        Self {
-            increase_by: increase,
-            ..self
-        }
-    }
-
-    pub fn with_max_limit(self, max: usize) -> Self {
-        assert!(max > 0);
-        Self {
-            max_limit: max,
-            ..self
-        }
-    }
-
-    /// A threshold below which the limit won't be increased. 0.5 = 50%.
-    pub fn with_min_utilisation_threshold(self, min_util: f32) -> Self {
-        assert!(min_util > 0. && min_util < 1.);
-        Self {
-            min_utilisation_threshold: min_util,
-            ..self
-        }
-    }
 }
 
 #[async_trait]
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 87c1597ca9..8dfdfcd3db 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -1,12 +1,16 @@
-use std::sync::{
-    atomic::{AtomicUsize, Ordering},
-    Arc,
+use std::{
+    collections::hash_map::RandomState,
+    hash::BuildHasher,
+    sync::{
+        atomic::{AtomicUsize, Ordering},
+        Arc, Mutex,
+    },
 };
 
 use anyhow::bail;
 use dashmap::DashMap;
 use itertools::Itertools;
-use rand::{thread_rng, Rng};
+use rand::{rngs::StdRng, Rng, SeedableRng};
 use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
 use tokio::time::{timeout, Duration, Instant};
@@ -28,10 +32,11 @@ use super::{
 // saw SNI, before doing TLS handshake. User-side error messages in that case
 // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
 // I went with a more expensive way that yields user-friendlier error messages.
-pub struct EndpointRateLimiter {
-    map: DashMap<SmolStr, Vec<RateBucket>>,
+pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
+    map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
     info: &'static [RateBucketInfo],
     access_count: AtomicUsize,
+    rand: Mutex<Rand>,
 }
 
 #[derive(Clone, Copy)]
@@ -125,11 +130,18 @@ impl RateBucketInfo {
 
 impl EndpointRateLimiter {
     pub fn new(info: &'static [RateBucketInfo]) -> Self {
+        Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new())
+    }
+}
+
+impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
+    fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self {
         info!(buckets = ?info, "endpoint rate limiter");
         Self {
             info,
-            map: DashMap::with_shard_amount(64),
+            map: DashMap::with_hasher_and_shard_amount(hasher, 64),
             access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request
+            rand: Mutex::new(rand),
         }
     }
 
@@ -176,7 +188,9 @@ impl EndpointRateLimiter {
             self.map.len()
         );
         let n = self.map.shards().len();
-        let shard = thread_rng().gen_range(0..n);
+        // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide
+        // (impossible, infact, unless we have 2048 threads)
+        let shard = self.rand.lock().unwrap().gen_range(0..n);
         self.map.shards()[shard].write().clear();
     }
 }
@@ -219,7 +233,6 @@ pub struct Token<'t> {
 #[derive(Debug, Clone, Copy)]
 pub struct LimiterState {
     limit: usize,
-    available: usize,
     in_flight: usize,
 }
 
@@ -397,11 +410,7 @@ impl Limiter {
     pub fn state(&self) -> LimiterState {
         let limit = self.limits.load(Ordering::Relaxed);
         let in_flight = self.in_flight.load(Ordering::Relaxed);
-        LimiterState {
-            limit,
-            available: limit.saturating_sub(in_flight),
-            in_flight,
-        }
+        LimiterState { limit, in_flight }
     }
 }
 
@@ -414,13 +423,6 @@ impl<'t> Token<'t> {
         }
     }
 
-    #[cfg(test)]
-    pub fn set_latency(&mut self, latency: Duration) {
-        use std::ops::Sub;
-
-        self.start = Instant::now().sub(latency);
-    }
-
     pub fn forget(&mut self) {
         if let Some(permit) = self.permit.take() {
             permit.forget();
@@ -439,10 +441,6 @@ impl LimiterState {
     pub fn limit(&self) -> usize {
         self.limit
     }
-    /// The amount of concurrency available to use.
-    pub fn available(&self) -> usize {
-        self.available
-    }
     /// The number of jobs in flight.
     pub fn in_flight(&self) -> usize {
         self.in_flight
@@ -490,9 +488,11 @@ impl reqwest_middleware::Middleware for Limiter {
 
 #[cfg(test)]
 mod tests {
-    use std::{pin::pin, task::Context, time::Duration};
+    use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration};
 
     use futures::{task::noop_waker_ref, Future};
+    use rand::SeedableRng;
+    use rustc_hash::FxHasher;
     use smol_str::SmolStr;
     use tokio::time;
 
@@ -690,4 +690,21 @@ mod tests {
             assert!(limiter.check(endpoint.clone()));
         }
     }
+
+    #[tokio::test]
+    async fn test_rate_limits_gc() {
+        // fixed seeded random/hasher to ensure that the test is not flaky
+        let rand = rand::rngs::StdRng::from_seed([1; 32]);
+        let hasher = BuildHasherDefault::<FxHasher>::default();
+
+        let limiter = EndpointRateLimiter::new_with_rand_and_hasher(
+            &RateBucketInfo::DEFAULT_SET,
+            rand,
+            hasher,
+        );
+        for i in 0..1_000_000 {
+            limiter.check(format!("{i}").into());
+        }
+        assert!(limiter.map.len() < 150_000);
+    }
 }
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index cd6184cdee..071add3bca 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -27,15 +27,15 @@ use sync_wrapper::SyncWrapper;
 pin_project! {
     /// This is a wrapper around a [`WebSocketStream`] that
     /// implements [`AsyncRead`] and [`AsyncWrite`].
-    pub struct WebSocketRw {
+    pub struct WebSocketRw<S = Upgraded> {
         #[pin]
-        stream: SyncWrapper<WebSocketStream<Upgraded>>,
+        stream: SyncWrapper<WebSocketStream<S>>,
         bytes: Bytes,
     }
 }
 
-impl WebSocketRw {
-    pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
+impl<S> WebSocketRw<S> {
+    pub fn new(stream: WebSocketStream<S>) -> Self {
         Self {
             stream: stream.into(),
             bytes: Bytes::new(),
@@ -43,7 +43,7 @@ impl WebSocketRw {
     }
 }
 
-impl AsyncWrite for WebSocketRw {
+impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
     fn poll_write(
         self: Pin<&mut Self>,
         cx: &mut Context<'_>,
@@ -69,7 +69,7 @@ impl AsyncWrite for WebSocketRw {
     }
 }
 
-impl AsyncRead for WebSocketRw {
+impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for WebSocketRw<S> {
     fn poll_read(
         mut self: Pin<&mut Self>,
         cx: &mut Context<'_>,
@@ -86,7 +86,7 @@ impl AsyncRead for WebSocketRw {
     }
 }
 
-impl AsyncBufRead for WebSocketRw {
+impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
     fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
         // Please refer to poll_fill_buf's documentation.
         const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
@@ -151,3 +151,60 @@ pub async fn serve_websocket(
     .await?;
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use std::pin::pin;
+
+    use futures::{SinkExt, StreamExt};
+    use hyper_tungstenite::{
+        tungstenite::{protocol::Role, Message},
+        WebSocketStream,
+    };
+    use tokio::{
+        io::{duplex, AsyncReadExt, AsyncWriteExt},
+        task::JoinSet,
+    };
+
+    use super::WebSocketRw;
+
+    #[tokio::test]
+    async fn websocket_stream_wrapper_happy_path() {
+        let (stream1, stream2) = duplex(1024);
+
+        let mut js = JoinSet::new();
+
+        js.spawn(async move {
+            let mut client = WebSocketStream::from_raw_socket(stream1, Role::Client, None).await;
+
+            client
+                .send(Message::Binary(b"hello world".to_vec()))
+                .await
+                .unwrap();
+
+            let message = client.next().await.unwrap().unwrap();
+            assert_eq!(message, Message::Binary(b"websockets are cool".to_vec()));
+
+            client.close(None).await.unwrap();
+        });
+
+        js.spawn(async move {
+            let mut rw = pin!(WebSocketRw::new(
+                WebSocketStream::from_raw_socket(stream2, Role::Server, None).await
+            ));
+
+            let mut buf = vec![0; 1024];
+            let n = rw.read(&mut buf).await.unwrap();
+            assert_eq!(&buf[..n], b"hello world");
+
+            rw.write_all(b"websockets are cool").await.unwrap();
+            rw.flush().await.unwrap();
+
+            let n = rw.read_to_end(&mut buf).await.unwrap();
+            assert_eq!(n, 0);
+        });
+
+        js.join_next().await.unwrap().unwrap();
+        js.join_next().await.unwrap().unwrap();
+    }
+}

From bd1cb1b217f39c8d109ccca9c285bc1da959620e Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Dec 2023 15:08:28 +0000
Subject: [PATCH 105/202] tests: update allow list for `negative_env` (#6144)

Tests attaching the tenant immediately after the fixture detaches it
could result in LSN updates failing validation

e.g.
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6142/7211196140/index.html#suites/7745dadbd815ab87f5798aa881796f47/32b12ccc0b01b122
---
 test_runner/regress/test_attach_tenant_config.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index d2bd53d8aa..352ec13884 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -52,7 +52,16 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
         TenantId(t["id"]) for t in ps_http.tenant_list()
     ], "tenant should not be attached after negative test"
 
-    env.pageserver.allowed_errors.append(".*Error processing HTTP request: Bad request")
+    env.pageserver.allowed_errors.extend(
+        [
+            # This fixture detaches the tenant, and tests using it will tend to re-attach it
+            # shortly after. There may be un-processed deletion_queue validations from the
+            # initial attachment
+            ".*Dropped remote consistent LSN updates.*",
+            # This fixture is for tests that will intentionally generate 400 responses
+            ".*Error processing HTTP request: Bad request",
+        ]
+    )
 
     def log_contains_bad_request():
         env.pageserver.log_contains(".*Error processing HTTP request: Bad request")

From e62569a8788322d04d9eb397c1c35a670df37fdb Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 12 Dec 2023 00:22:23 +0300
Subject: [PATCH 106/202] A few comments on rust walproposer build.

---
 libs/walproposer/build.rs            | 3 +++
 libs/walproposer/src/api_bindings.rs | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index d32c8ab299..fd09030dbd 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -1,3 +1,6 @@
+//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h
+//! to generate Rust bindings for it.
+
 use std::{env, path::PathBuf, process::Command};
 
 use anyhow::{anyhow, Context};
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 7f1bbc3b80..77afe1e686 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -1,3 +1,6 @@
+//! A C-Rust shim: defines implementation of C walproposer API, assuming wp
+//! callback_data stores Box to some Rust implementation.
+
 #![allow(dead_code)]
 
 use std::ffi::CStr;

From de1a9c6e3b5d40df8c36a0a77f9d2cfad1c3e35b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Dec 2023 15:48:55 +0000
Subject: [PATCH 107/202] s3_scrubber: basic support for sharding (#6119)

This doesn't make the scrubber smart enough to understand that many
shards are part of the same tenant, but it makes it understand paths
well enough to scrub the individual shards without thinking they're
malformed.

This is a prerequisite to being able to run tests with sharding enabled.

Related: #5929
---
 Cargo.lock                         |  1 +
 s3_scrubber/Cargo.toml             |  1 +
 s3_scrubber/src/checks.rs          |  9 +++----
 s3_scrubber/src/garbage.rs         | 38 +++++++++++++++-------------
 s3_scrubber/src/lib.rs             | 40 ++++++++++++++++++++++++++----
 s3_scrubber/src/metadata_stream.rs | 15 +++++------
 s3_scrubber/src/scan_metadata.rs   | 18 ++++++--------
 7 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 7bf9209395..08e6961408 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4292,6 +4292,7 @@ dependencies = [
  "histogram",
  "itertools",
  "pageserver",
+ "pageserver_api",
  "rand 0.8.5",
  "remote_storage",
  "reqwest",
diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml
index e26f2c6d6b..fdae378d55 100644
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -31,6 +31,7 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls"
 aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] }
 
 pageserver = { path = "../pageserver" }
+pageserver_api = { path = "../libs/pageserver_api" }
 remote_storage = { path = "../libs/remote_storage" }
 
 tracing.workspace = true
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index a15a908212..2acbb2352b 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -7,13 +7,12 @@ use utils::generation::Generation;
 
 use crate::cloud_admin_api::BranchData;
 use crate::metadata_stream::stream_listing;
-use crate::{download_object_with_retries, RootTarget};
+use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
 use futures_util::{pin_mut, StreamExt};
 use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::IndexPart;
 use remote_storage::RemotePath;
-use utils::id::TenantTimelineId;
 
 pub(crate) struct TimelineAnalysis {
     /// Anomalies detected
@@ -39,8 +38,8 @@ impl TimelineAnalysis {
     }
 }
 
-pub(crate) async fn branch_cleanup_and_check_errors(
-    id: &TenantTimelineId,
+pub(crate) fn branch_cleanup_and_check_errors(
+    id: &TenantShardTimelineId,
     s3_root: &RootTarget,
     s3_active_branch: Option<&BranchData>,
     console_branch: Option<BranchData>,
@@ -238,7 +237,7 @@ fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), St
 
 pub(crate) async fn list_timeline_blobs(
     s3_client: &Client,
-    id: TenantTimelineId,
+    id: TenantShardTimelineId,
     s3_root: &RootTarget,
 ) -> anyhow::Result<S3TimelineBlobData> {
     let mut s3_layers = HashSet::new();
diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs
index f27e1d7f65..7192afb91b 100644
--- a/s3_scrubber/src/garbage.rs
+++ b/s3_scrubber/src/garbage.rs
@@ -10,15 +10,16 @@ use aws_sdk_s3::{
     Client,
 };
 use futures_util::{pin_mut, TryStreamExt};
+use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
 use tokio_stream::StreamExt;
-use utils::id::{TenantId, TenantTimelineId};
+use utils::id::TenantId;
 
 use crate::{
     cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
     init_remote,
     metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants},
-    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TraversingDepth,
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth,
 };
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -29,8 +30,8 @@ enum GarbageReason {
 
 #[derive(Serialize, Deserialize, Debug)]
 enum GarbageEntity {
-    Tenant(TenantId),
-    Timeline(TenantTimelineId),
+    Tenant(TenantShardId),
+    Timeline(TenantShardTimelineId),
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -142,6 +143,9 @@ async fn find_garbage_inner(
         console_projects.len()
     );
 
+    // TODO(sharding): batch calls into Console so that we only call once for each TenantId,
+    // rather than checking the same TenantId for multiple TenantShardId
+
     // Enumerate Tenants in S3, and check if each one exists in Console
     tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
     let tenants = stream_tenants(&s3_client, &target);
@@ -149,10 +153,10 @@ async fn find_garbage_inner(
         let api_client = cloud_admin_api_client.clone();
         let console_projects = &console_projects;
         async move {
-            match console_projects.get(&t) {
+            match console_projects.get(&t.tenant_id) {
                 Some(project_data) => Ok((t, Some(project_data.clone()))),
                 None => api_client
-                    .find_tenant_project(t)
+                    .find_tenant_project(t.tenant_id)
                     .await
                     .map_err(|e| anyhow::anyhow!(e))
                     .map(|r| (t, r)),
@@ -166,21 +170,21 @@ async fn find_garbage_inner(
     // checks if they are enabled by the `depth` parameter.
     pin_mut!(tenants_checked);
     let mut garbage = GarbageList::new(node_kind, bucket_config);
-    let mut active_tenants: Vec<TenantId> = vec![];
+    let mut active_tenants: Vec<TenantShardId> = vec![];
     let mut counter = 0;
     while let Some(result) = tenants_checked.next().await {
-        let (tenant_id, console_result) = result?;
+        let (tenant_shard_id, console_result) = result?;
 
         // Paranoia check
         if let Some(project) = &console_result {
-            assert!(project.tenant == tenant_id);
+            assert!(project.tenant == tenant_shard_id.tenant_id);
         }
 
-        if garbage.maybe_append(GarbageEntity::Tenant(tenant_id), console_result) {
-            tracing::debug!("Tenant {tenant_id} is garbage");
+        if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
+            tracing::debug!("Tenant {tenant_shard_id} is garbage");
         } else {
-            tracing::debug!("Tenant {tenant_id} is active");
-            active_tenants.push(tenant_id);
+            tracing::debug!("Tenant {tenant_shard_id} is active");
+            active_tenants.push(tenant_shard_id);
         }
 
         counter += 1;
@@ -266,13 +270,13 @@ impl std::fmt::Display for PurgeMode {
 pub async fn get_tenant_objects(
     s3_client: &Arc<Client>,
     target: RootTarget,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
 ) -> anyhow::Result<Vec<ObjectIdentifier>> {
-    tracing::debug!("Listing objects in tenant {tenant_id}");
+    tracing::debug!("Listing objects in tenant {tenant_shard_id}");
     // TODO: apply extra validation based on object modification time.  Don't purge
     // tenants where any timeline's index_part.json has been touched recently.
 
-    let mut tenant_root = target.tenant_root(&tenant_id);
+    let mut tenant_root = target.tenant_root(&tenant_shard_id);
 
     // Remove delimiter, so that object listing lists all keys in the prefix and not just
     // common prefixes.
@@ -285,7 +289,7 @@ pub async fn get_tenant_objects(
 pub async fn get_timeline_objects(
     s3_client: &Arc<Client>,
     target: RootTarget,
-    ttid: TenantTimelineId,
+    ttid: TenantShardTimelineId,
 ) -> anyhow::Result<Vec<ObjectIdentifier>> {
     tracing::debug!("Listing objects in timeline {ttid}");
     let mut timeline_root = target.timeline_root(&ttid);
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index 6607db21e6..d2338c21e5 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -22,6 +22,7 @@ use aws_sdk_s3::{Client, Config};
 
 use clap::ValueEnum;
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
+use pageserver_api::shard::TenantShardId;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
 use std::io::IsTerminal;
@@ -29,7 +30,7 @@ use tokio::io::AsyncReadExt;
 use tracing::error;
 use tracing_appender::non_blocking::WorkerGuard;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
-use utils::id::{TenantId, TenantTimelineId};
+use utils::id::TimelineId;
 
 const MAX_RETRIES: usize = 20;
 const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
@@ -44,6 +45,35 @@ pub struct S3Target {
     pub delimiter: String,
 }
 
+/// Convenience for referring to timelines within a particular shard: more ergonomic
+/// than using a 2-tuple.
+///
+/// This is the shard-aware equivalent of TenantTimelineId.  It's defined here rather
+/// than somewhere more broadly exposed, because this kind of thing is rarely needed
+/// in the pageserver, as all timeline objects exist in the scope of a particular
+/// tenant: the scrubber is different in that it handles collections of data referring to many
+/// TenantShardTimelineIds in one place.
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)]
+pub struct TenantShardTimelineId {
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+}
+
+impl TenantShardTimelineId {
+    fn new(tenant_shard_id: TenantShardId, timeline_id: TimelineId) -> Self {
+        Self {
+            tenant_shard_id,
+            timeline_id,
+        }
+    }
+}
+
+impl Display for TenantShardTimelineId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}/{}", self.tenant_shard_id, self.timeline_id)
+    }
+}
+
 #[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)]
 pub enum TraversingDepth {
     Tenant,
@@ -110,19 +140,19 @@ impl RootTarget {
         }
     }
 
-    pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target {
+    pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
         self.tenants_root().with_sub_segment(&tenant_id.to_string())
     }
 
-    pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target {
+    pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target {
         match self {
             Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"),
             Self::Safekeeper(_) => self.tenant_root(tenant_id),
         }
     }
 
-    pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target {
-        self.timelines_root(&id.tenant_id)
+    pub fn timeline_root(&self, id: &TenantShardTimelineId) -> S3Target {
+        self.timelines_root(&id.tenant_shard_id)
             .with_sub_segment(&id.timeline_id.to_string())
     }
 
diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs
index 4cfa77cfc1..073f37f319 100644
--- a/s3_scrubber/src/metadata_stream.rs
+++ b/s3_scrubber/src/metadata_stream.rs
@@ -3,14 +3,15 @@ use async_stream::{stream, try_stream};
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
 use tokio_stream::Stream;
 
-use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId};
-use utils::id::{TenantTimelineId, TimelineId};
+use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
+use pageserver_api::shard::TenantShardId;
+use utils::id::TimelineId;
 
 /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
 pub fn stream_tenants<'a>(
     s3_client: &'a Client,
     target: &'a RootTarget,
-) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
+) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
     try_stream! {
         let mut continuation_token = None;
         let tenants_target = target.tenants_root();
@@ -44,14 +45,14 @@ pub fn stream_tenants<'a>(
     }
 }
 
-/// Given a TenantId, output a stream of the timelines within that tenant, discovered
+/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered
 /// using ListObjectsv2.  The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
 pub async fn stream_tenant_timelines<'a>(
     s3_client: &'a Client,
     target: &'a RootTarget,
-    tenant: TenantId,
-) -> anyhow::Result<impl Stream<Item = Result<TenantTimelineId, anyhow::Error>> + 'a> {
+    tenant: TenantShardId,
+) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
     let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
     let mut continuation_token = None;
     let timelines_target = target.timelines_root(&tenant);
@@ -98,7 +99,7 @@ pub async fn stream_tenant_timelines<'a>(
     Ok(stream! {
         for i in timeline_ids {
             let id = i?;
-            yield Ok(TenantTimelineId::new(tenant, id));
+            yield Ok(TenantShardTimelineId::new(tenant, id));
         }
     })
 }
diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs
index 228f8d6763..91347ca21b 100644
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -5,20 +5,19 @@ use crate::checks::{
     TimelineAnalysis,
 };
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use aws_sdk_s3::Client;
 use futures_util::{pin_mut, StreamExt, TryStreamExt};
 use histogram::Histogram;
 use pageserver::tenant::IndexPart;
 use serde::Serialize;
-use utils::id::TenantTimelineId;
 
 #[derive(Serialize)]
 pub struct MetadataSummary {
     count: usize,
-    with_errors: HashSet<TenantTimelineId>,
-    with_warnings: HashSet<TenantTimelineId>,
-    with_garbage: HashSet<TenantTimelineId>,
+    with_errors: HashSet<TenantShardTimelineId>,
+    with_warnings: HashSet<TenantShardTimelineId>,
+    with_garbage: HashSet<TenantShardTimelineId>,
     indices_by_version: HashMap<usize, usize>,
 
     layer_count: MinMaxHisto,
@@ -132,7 +131,7 @@ impl MetadataSummary {
         }
     }
 
-    fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) {
+    fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
         if !analysis.errors.is_empty() {
             self.with_errors.insert(*id);
         }
@@ -199,8 +198,8 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
     async fn report_on_timeline(
         s3_client: &Client,
         target: &RootTarget,
-        ttid: TenantTimelineId,
-    ) -> anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> {
+        ttid: TenantShardTimelineId,
+    ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
         let data = list_timeline_blobs(s3_client, ttid, target).await?;
         Ok((ttid, data))
     }
@@ -213,8 +212,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
         let (ttid, data) = i?;
         summary.update_data(&data);
 
-        let analysis =
-            branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)).await;
+        let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data));
 
         summary.update_analysis(&ttid, &analysis);
     }

From 1a9854bfb76712a2d72b10db966537ecbe518ab5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 15 Dec 2023 18:33:45 +0100
Subject: [PATCH 108/202] add a Rust client for Pageserver management API
 (#6127)

Part of getpage@lsn benchmark epic:
https://github.com/neondatabase/neon/issues/5771

This PR moves the control plane's spread-all-over-the-place client for
the pageserver management API into a separate module within the
pageserver crate.

I need that client to be async in my benchmarking work, so, this PR
switches to the async version of `reqwest`.
That is also the right direction generally IMO.

The switch to async in turn mandated converting most of the
`control_plane/` code to async.

Note that some of the client methods should be taking `TenantShardId`
instead of `TenantId`, but, none of the callers seem to be
sharding-aware.
Leaving that for another time:
https://github.com/neondatabase/neon/issues/6154
---
 Cargo.lock                              |  18 ++
 Cargo.toml                              |   2 +
 control_plane/Cargo.toml                |   7 +-
 control_plane/src/attachment_service.rs |  23 +-
 control_plane/src/background_process.rs |  35 ++-
 control_plane/src/bin/neon_local.rs     | 163 ++++++++------
 control_plane/src/broker.rs             |  11 +-
 control_plane/src/endpoint.rs           |  24 +-
 control_plane/src/pageserver.rs         | 282 +++++++++---------------
 control_plane/src/safekeeper.rs         |  40 ++--
 control_plane/src/tenant_migration.rs   |  42 ++--
 libs/postgres_connection/src/lib.rs     |  14 +-
 pageserver/client/Cargo.toml            |  14 ++
 pageserver/client/src/lib.rs            |   1 +
 pageserver/client/src/mgmt_api.rs       | 172 +++++++++++++++
 15 files changed, 529 insertions(+), 319 deletions(-)
 create mode 100644 pageserver/client/Cargo.toml
 create mode 100644 pageserver/client/src/lib.rs
 create mode 100644 pageserver/client/src/mgmt_api.rs

diff --git a/Cargo.lock b/Cargo.lock
index 08e6961408..99a936dd40 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1245,16 +1245,19 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "async-trait",
  "camino",
  "clap",
  "comfy-table",
  "compute_api",
+ "futures",
  "git-version",
  "hex",
  "hyper",
  "nix 0.26.2",
  "once_cell",
  "pageserver_api",
+ "pageserver_client",
  "postgres",
  "postgres_backend",
  "postgres_connection",
@@ -1268,6 +1271,8 @@ dependencies = [
  "tar",
  "thiserror",
  "tokio",
+ "tokio-postgres",
+ "tokio-util",
  "toml",
  "tracing",
  "url",
@@ -3172,6 +3177,19 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "pageserver_client"
+version = "0.1.0"
+dependencies = [
+ "async-trait",
+ "pageserver_api",
+ "reqwest",
+ "serde",
+ "thiserror",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"
diff --git a/Cargo.toml b/Cargo.toml
index 496a9d7839..b44544d626 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,7 @@ members = [
     "control_plane",
     "pageserver",
     "pageserver/ctl",
+    "pageserver/client",
     "proxy",
     "safekeeper",
     "storage_broker",
@@ -182,6 +183,7 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
+pageserver_client = { path = "./pageserver/client" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 7ccddc161e..898ad05add 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,9 +6,11 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
+async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
 once_cell.workspace = true
@@ -24,10 +26,11 @@ tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 tokio.workspace = true
+tokio-postgres.workspace = true
+tokio-util.workspace = true
 url.workspace = true
-# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
-# instead, so that recompile times are better.
 pageserver_api.workspace = true
+pageserver_client.workspace = true
 postgres_backend.workspace = true
 safekeeper_api.workspace = true
 postgres_connection.workspace = true
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 822ac7d8a6..731c05809e 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -9,7 +9,7 @@ pub struct AttachmentService {
     env: LocalEnv,
     listen: String,
     path: PathBuf,
-    client: reqwest::blocking::Client,
+    client: reqwest::Client,
 }
 
 const COMMAND: &str = "attachment_service";
@@ -53,7 +53,7 @@ impl AttachmentService {
             env: env.clone(),
             path,
             listen,
-            client: reqwest::blocking::ClientBuilder::new()
+            client: reqwest::ClientBuilder::new()
                 .build()
                 .expect("Failed to construct http client"),
         }
@@ -64,7 +64,7 @@ impl AttachmentService {
             .expect("non-Unicode path")
     }
 
-    pub fn start(&self) -> anyhow::Result<Child> {
+    pub async fn start(&self) -> anyhow::Result<Child> {
         let path_str = self.path.to_string_lossy();
 
         background_process::start_process(
@@ -73,10 +73,11 @@ impl AttachmentService {
             &self.env.attachment_service_bin(),
             ["-l", &self.listen, "-p", &path_str],
             [],
-            background_process::InitialPidFile::Create(&self.pid_file()),
+            background_process::InitialPidFile::Create(self.pid_file()),
             // TODO: a real status check
-            || Ok(true),
+            || async move { anyhow::Ok(true) },
         )
+        .await
     }
 
     pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
@@ -84,7 +85,7 @@ impl AttachmentService {
     }
 
     /// Call into the attach_hook API, for use before handing out attachments to pageservers
-    pub fn attach_hook(
+    pub async fn attach_hook(
         &self,
         tenant_id: TenantId,
         pageserver_id: NodeId,
@@ -104,16 +105,16 @@ impl AttachmentService {
             node_id: Some(pageserver_id),
         };
 
-        let response = self.client.post(url).json(&request).send()?;
+        let response = self.client.post(url).json(&request).send().await?;
         if response.status() != StatusCode::OK {
             return Err(anyhow!("Unexpected status {}", response.status()));
         }
 
-        let response = response.json::<AttachHookResponse>()?;
+        let response = response.json::<AttachHookResponse>().await?;
         Ok(response.gen)
     }
 
-    pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
+    pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
         use hyper::StatusCode;
 
         let url = self
@@ -126,12 +127,12 @@ impl AttachmentService {
 
         let request = InspectRequest { tenant_id };
 
-        let response = self.client.post(url).json(&request).send()?;
+        let response = self.client.post(url).json(&request).send().await?;
         if response.status() != StatusCode::OK {
             return Err(anyhow!("Unexpected status {}", response.status()));
         }
 
-        let response = response.json::<InspectResponse>()?;
+        let response = response.json::<InspectResponse>().await?;
         Ok(response.attachment)
     }
 }
diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 26fc08fc8f..20fa3af9b8 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
 
 /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
 /// it itself.
-pub enum InitialPidFile<'t> {
+pub enum InitialPidFile {
     /// Create a pidfile, to allow future CLI invocations to manipulate the process.
-    Create(&'t Utf8Path),
+    Create(Utf8PathBuf),
     /// The process will create the pidfile itself, need to wait for that event.
-    Expect(&'t Utf8Path),
+    Expect(Utf8PathBuf),
 }
 
 /// Start a background child process using the parameters given.
-pub fn start_process<F, AI, A, EI>(
+pub async fn start_process<F, Fut, AI, A, EI>(
     process_name: &str,
     datadir: &Path,
     command: &Path,
@@ -62,7 +62,8 @@ pub fn start_process<F, AI, A, EI>(
     process_status_check: F,
 ) -> anyhow::Result<Child>
 where
-    F: Fn() -> anyhow::Result<bool>,
+    F: Fn() -> Fut,
+    Fut: std::future::Future<Output = anyhow::Result<bool>>,
     AI: IntoIterator<Item = A>,
     A: AsRef<OsStr>,
     // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
@@ -89,7 +90,7 @@ where
     let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
     filled_cmd.envs(envs);
 
-    let pid_file_to_check = match initial_pid_file {
+    let pid_file_to_check = match &initial_pid_file {
         InitialPidFile::Create(path) => {
             pre_exec_create_pidfile(filled_cmd, path);
             path
@@ -107,7 +108,7 @@ where
     );
 
     for retries in 0..RETRIES {
-        match process_started(pid, Some(pid_file_to_check), &process_status_check) {
+        match process_started(pid, pid_file_to_check, &process_status_check).await {
             Ok(true) => {
                 println!("\n{process_name} started, pid: {pid}");
                 return Ok(spawned_process);
@@ -316,22 +317,20 @@ where
     cmd
 }
 
-fn process_started<F>(
+async fn process_started<F, Fut>(
     pid: Pid,
-    pid_file_to_check: Option<&Utf8Path>,
+    pid_file_to_check: &Utf8Path,
     status_check: &F,
 ) -> anyhow::Result<bool>
 where
-    F: Fn() -> anyhow::Result<bool>,
+    F: Fn() -> Fut,
+    Fut: std::future::Future<Output = anyhow::Result<bool>>,
 {
-    match status_check() {
-        Ok(true) => match pid_file_to_check {
-            Some(pid_file_path) => match pid_file::read(pid_file_path)? {
-                PidFileRead::NotExist => Ok(false),
-                PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
-                PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
-            },
-            None => Ok(true),
+    match status_check().await {
+        Ok(true) => match pid_file::read(pid_file_to_check)? {
+            PidFileRead::NotExist => Ok(false),
+            PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
+            PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
         },
         Ok(false) => Ok(false),
         Err(e) => anyhow::bail!("process failed to start: {e}"),
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 6f0b929ac6..03e69010f7 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -120,15 +120,20 @@ fn main() -> Result<()> {
         let mut env = LocalEnv::load_config().context("Error loading config")?;
         let original_env = env.clone();
 
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+
         let subcommand_result = match sub_name {
-            "tenant" => handle_tenant(sub_args, &mut env),
-            "timeline" => handle_timeline(sub_args, &mut env),
-            "start" => handle_start_all(sub_args, &env),
+            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
+            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
+            "start" => rt.block_on(handle_start_all(sub_args, &env)),
             "stop" => handle_stop_all(sub_args, &env),
-            "pageserver" => handle_pageserver(sub_args, &env),
-            "attachment_service" => handle_attachment_service(sub_args, &env),
-            "safekeeper" => handle_safekeeper(sub_args, &env),
-            "endpoint" => handle_endpoint(sub_args, &env),
+            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
+            "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
+            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
+            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
             "mappings" => handle_mappings(sub_args, &mut env),
             "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
             _ => bail!("unexpected subcommand {sub_name}"),
@@ -269,12 +274,13 @@ fn print_timeline(
 
 /// Returns a map of timeline IDs to timeline_id@lsn strings.
 /// Connects to the pageserver to query this information.
-fn get_timeline_infos(
+async fn get_timeline_infos(
     env: &local_env::LocalEnv,
     tenant_id: &TenantId,
 ) -> Result<HashMap<TimelineId, TimelineInfo>> {
     Ok(get_default_pageserver(env)
-        .timeline_list(tenant_id)?
+        .timeline_list(tenant_id)
+        .await?
         .into_iter()
         .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
         .collect())
@@ -373,11 +379,14 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
         .collect()
 }
 
-fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
+async fn handle_tenant(
+    tenant_match: &ArgMatches,
+    env: &mut local_env::LocalEnv,
+) -> anyhow::Result<()> {
     let pageserver = get_default_pageserver(env);
     match tenant_match.subcommand() {
         Some(("list", _)) => {
-            for t in pageserver.tenant_list()? {
+            for t in pageserver.tenant_list().await? {
                 println!("{} {:?}", t.id, t.state);
             }
         }
@@ -394,12 +403,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                 // We must register the tenant with the attachment service, so
                 // that when the pageserver restarts, it will be re-attached.
                 let attachment_service = AttachmentService::from_env(env);
-                attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
+                attachment_service
+                    .attach_hook(tenant_id, pageserver.conf.id)
+                    .await?
             } else {
                 None
             };
 
-            pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
+            pageserver
+                .tenant_create(tenant_id, generation, tenant_conf)
+                .await?;
             println!("tenant {tenant_id} successfully created on the pageserver");
 
             // Create an initial timeline for the new tenant
@@ -409,14 +422,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                 .copied()
                 .context("Failed to parse postgres version from the argument string")?;
 
-            let timeline_info = pageserver.timeline_create(
-                tenant_id,
-                new_timeline_id,
-                None,
-                None,
-                Some(pg_version),
-                None,
-            )?;
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    new_timeline_id,
+                    None,
+                    None,
+                    Some(pg_version),
+                    None,
+                )
+                .await?;
             let new_timeline_id = timeline_info.timeline_id;
             let last_record_lsn = timeline_info.last_record_lsn;
 
@@ -450,6 +465,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 
             pageserver
                 .tenant_config(tenant_id, tenant_conf)
+                .await
                 .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
             println!("tenant {tenant_id} successfully configured on the pageserver");
         }
@@ -458,7 +474,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
             let new_pageserver = get_pageserver(env, matches)?;
             let new_pageserver_id = new_pageserver.conf.id;
 
-            migrate_tenant(env, tenant_id, new_pageserver)?;
+            migrate_tenant(env, tenant_id, new_pageserver).await?;
             println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
         }
 
@@ -468,13 +484,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
     Ok(())
 }
 
-fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
     let pageserver = get_default_pageserver(env);
 
     match timeline_match.subcommand() {
         Some(("list", list_match)) => {
             let tenant_id = get_tenant_id(list_match, env)?;
-            let timelines = pageserver.timeline_list(&tenant_id)?;
+            let timelines = pageserver.timeline_list(&tenant_id).await?;
             print_timelines_tree(timelines, env.timeline_name_mappings())?;
         }
         Some(("create", create_match)) => {
@@ -490,14 +506,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
 
             let new_timeline_id_opt = parse_timeline_id(create_match)?;
 
-            let timeline_info = pageserver.timeline_create(
-                tenant_id,
-                new_timeline_id_opt,
-                None,
-                None,
-                Some(pg_version),
-                None,
-            )?;
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    new_timeline_id_opt,
+                    None,
+                    None,
+                    Some(pg_version),
+                    None,
+                )
+                .await?;
             let new_timeline_id = timeline_info.timeline_id;
 
             let last_record_lsn = timeline_info.last_record_lsn;
@@ -542,7 +560,9 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
 
             let mut cplane = ComputeControlPlane::load(env.clone())?;
             println!("Importing timeline into pageserver ...");
-            pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
+            pageserver
+                .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
+                .await?;
             env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
 
             println!("Creating endpoint for imported timeline ...");
@@ -578,14 +598,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 .map(|lsn_str| Lsn::from_str(lsn_str))
                 .transpose()
                 .context("Failed to parse ancestor start Lsn from the request")?;
-            let timeline_info = pageserver.timeline_create(
-                tenant_id,
-                None,
-                start_lsn,
-                Some(ancestor_timeline_id),
-                None,
-                None,
-            )?;
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    None,
+                    start_lsn,
+                    Some(ancestor_timeline_id),
+                    None,
+                    None,
+                )
+                .await?;
             let new_timeline_id = timeline_info.timeline_id;
 
             let last_record_lsn = timeline_info.last_record_lsn;
@@ -604,7 +626,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
     Ok(())
 }
 
-fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     let (sub_name, sub_args) = match ep_match.subcommand() {
         Some(ep_subcommand_data) => ep_subcommand_data,
         None => bail!("no endpoint subcommand provided"),
@@ -614,10 +636,12 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
     match sub_name {
         "list" => {
             let tenant_id = get_tenant_id(sub_args, env)?;
-            let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
-                eprintln!("Failed to load timeline info: {}", e);
-                HashMap::new()
-            });
+            let timeline_infos = get_timeline_infos(env, &tenant_id)
+                .await
+                .unwrap_or_else(|e| {
+                    eprintln!("Failed to load timeline info: {}", e);
+                    HashMap::new()
+                });
 
             let timeline_name_mappings = env.timeline_name_mappings();
 
@@ -791,7 +815,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
             };
 
             println!("Starting existing endpoint {endpoint_id}...");
-            endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
+            endpoint
+                .start(&auth_token, safekeepers, remote_ext_config)
+                .await?;
         }
         "reconfigure" => {
             let endpoint_id = sub_args
@@ -809,7 +835,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                 } else {
                     None
                 };
-            endpoint.reconfigure(pageserver_id)?;
+            endpoint.reconfigure(pageserver_id).await?;
         }
         "stop" => {
             let endpoint_id = sub_args
@@ -875,11 +901,12 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
     ))
 }
 
-fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     match sub_match.subcommand() {
         Some(("start", subcommand_args)) => {
             if let Err(e) = get_pageserver(env, subcommand_args)?
                 .start(&pageserver_config_overrides(subcommand_args))
+                .await
             {
                 eprintln!("pageserver start failed: {e}");
                 exit(1);
@@ -906,7 +933,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                 exit(1);
             }
 
-            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+            if let Err(e) = pageserver
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
                 eprintln!("pageserver start failed: {e}");
                 exit(1);
             }
@@ -920,14 +950,17 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                 exit(1);
             }
 
-            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+            if let Err(e) = pageserver
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
                 eprintln!("pageserver start failed: {e}");
                 exit(1);
             }
         }
 
         Some(("status", subcommand_args)) => {
-            match get_pageserver(env, subcommand_args)?.check_status() {
+            match get_pageserver(env, subcommand_args)?.check_status().await {
                 Ok(_) => println!("Page server is up and running"),
                 Err(err) => {
                     eprintln!("Page server is not available: {}", err);
@@ -942,11 +975,14 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
     Ok(())
 }
 
-fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_attachment_service(
+    sub_match: &ArgMatches,
+    env: &local_env::LocalEnv,
+) -> Result<()> {
     let svc = AttachmentService::from_env(env);
     match sub_match.subcommand() {
         Some(("start", _start_match)) => {
-            if let Err(e) = svc.start() {
+            if let Err(e) = svc.start().await {
                 eprintln!("start failed: {e}");
                 exit(1);
             }
@@ -987,7 +1023,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
         .collect()
 }
 
-fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     let (sub_name, sub_args) = match sub_match.subcommand() {
         Some(safekeeper_command_data) => safekeeper_command_data,
         None => bail!("no safekeeper subcommand provided"),
@@ -1005,7 +1041,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
         "start" => {
             let extra_opts = safekeeper_extra_opts(sub_args);
 
-            if let Err(e) = safekeeper.start(extra_opts) {
+            if let Err(e) = safekeeper.start(extra_opts).await {
                 eprintln!("safekeeper start failed: {}", e);
                 exit(1);
             }
@@ -1031,7 +1067,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
             }
 
             let extra_opts = safekeeper_extra_opts(sub_args);
-            if let Err(e) = safekeeper.start(extra_opts) {
+            if let Err(e) = safekeeper.start(extra_opts).await {
                 eprintln!("safekeeper start failed: {}", e);
                 exit(1);
             }
@@ -1044,15 +1080,15 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
     Ok(())
 }
 
-fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
+async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
     // Endpoints are not started automatically
 
-    broker::start_broker_process(env)?;
+    broker::start_broker_process(env).await?;
 
     // Only start the attachment service if the pageserver is configured to need it
     if env.control_plane_api.is_some() {
         let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.start() {
+        if let Err(e) = attachment_service.start().await {
             eprintln!("attachment_service start failed: {:#}", e);
             try_stop_all(env, true);
             exit(1);
@@ -1061,7 +1097,10 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
 
     for ps_conf in &env.pageservers {
         let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
+        if let Err(e) = pageserver
+            .start(&pageserver_config_overrides(sub_match))
+            .await
+        {
             eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
             try_stop_all(env, true);
             exit(1);
@@ -1070,7 +1109,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
 
     for node in env.safekeepers.iter() {
         let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![]) {
+        if let Err(e) = safekeeper.start(vec![]).await {
             eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
             try_stop_all(env, false);
             exit(1);
diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs
index 6be865cc2e..f40705863b 100644
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -11,7 +11,7 @@ use camino::Utf8PathBuf;
 
 use crate::{background_process, local_env};
 
-pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
     let broker = &env.broker;
     let listen_addr = &broker.listen_addr;
 
@@ -19,15 +19,15 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
 
     let args = [format!("--listen-addr={listen_addr}")];
 
-    let client = reqwest::blocking::Client::new();
+    let client = reqwest::Client::new();
     background_process::start_process(
         "storage_broker",
         &env.base_data_dir,
         &env.storage_broker_bin(),
         args,
         [],
-        background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
-        || {
+        background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
+        || async {
             let url = broker.client_url();
             let status_url = url.join("status").with_context(|| {
                 format!("Failed to append /status path to broker endpoint {url}")
@@ -36,12 +36,13 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
                 .get(status_url)
                 .build()
                 .with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
-            match client.execute(request) {
+            match client.execute(request).await {
                 Ok(resp) => Ok(resp.status().is_success()),
                 Err(_) => Ok(false),
             }
         },
     )
+    .await
     .context("Failed to spawn storage_broker subprocess")?;
     Ok(())
 }
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index a566f03db9..071f22dc2b 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -464,7 +464,7 @@ impl Endpoint {
         }
     }
 
-    pub fn start(
+    pub async fn start(
         &self,
         auth_token: &Option<String>,
         safekeepers: Vec<NodeId>,
@@ -587,7 +587,7 @@ impl Endpoint {
         const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
         loop {
             attempt += 1;
-            match self.get_status() {
+            match self.get_status().await {
                 Ok(state) => {
                     match state.status {
                         ComputeStatus::Init => {
@@ -629,8 +629,8 @@ impl Endpoint {
     }
 
     // Call the /status HTTP API
-    pub fn get_status(&self) -> Result<ComputeState> {
-        let client = reqwest::blocking::Client::new();
+    pub async fn get_status(&self) -> Result<ComputeState> {
+        let client = reqwest::Client::new();
 
         let response = client
             .request(
@@ -641,16 +641,17 @@ impl Endpoint {
                     self.http_address.port()
                 ),
             )
-            .send()?;
+            .send()
+            .await?;
 
         // Interpret the response
         let status = response.status();
         if !(status.is_client_error() || status.is_server_error()) {
-            Ok(response.json()?)
+            Ok(response.json().await?)
         } else {
             // reqwest does not export its error construction utility functions, so let's craft the message ourselves
             let url = response.url().to_owned();
-            let msg = match response.text() {
+            let msg = match response.text().await {
                 Ok(err_body) => format!("Error: {}", err_body),
                 Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
             };
@@ -658,7 +659,7 @@ impl Endpoint {
         }
     }
 
-    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
+    pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
         let mut spec: ComputeSpec = {
             let spec_path = self.endpoint_path().join("spec.json");
             let file = std::fs::File::open(spec_path)?;
@@ -687,7 +688,7 @@ impl Endpoint {
             spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
         }
 
-        let client = reqwest::blocking::Client::new();
+        let client = reqwest::Client::new();
         let response = client
             .post(format!(
                 "http://{}:{}/configure",
@@ -698,14 +699,15 @@ impl Endpoint {
                 "{{\"spec\":{}}}",
                 serde_json::to_string_pretty(&spec)?
             ))
-            .send()?;
+            .send()
+            .await?;
 
         let status = response.status();
         if !(status.is_client_error() || status.is_server_error()) {
             Ok(())
         } else {
             let url = response.url().to_owned();
-            let msg = match response.text() {
+            let msg = match response.text().await {
                 Ok(err_body) => format!("Error: {}", err_body),
                 Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
             };
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 981ddd3e98..7d490016bf 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -6,28 +6,24 @@
 //!
 use std::borrow::Cow;
 use std::collections::HashMap;
-use std::fs::File;
-use std::io::{BufReader, Write};
+
+use std::io;
+use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::{Child, Command};
 use std::time::Duration;
-use std::{io, result};
 
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{
-    self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
-};
+use futures::SinkExt;
+use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
+use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
-use reqwest::blocking::{Client, RequestBuilder, Response};
-use reqwest::{IntoUrl, Method};
-use thiserror::Error;
 use utils::auth::{Claims, Scope};
 use utils::{
-    http::error::HttpErrorBody,
     id::{TenantId, TimelineId},
     lsn::Lsn,
 };
@@ -38,45 +34,6 @@ use crate::{background_process, local_env::LocalEnv};
 /// Directory within .neon which will be used by default for LocalFs remote storage.
 pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
 
-#[derive(Error, Debug)]
-pub enum PageserverHttpError {
-    #[error("Reqwest error: {0}")]
-    Transport(#[from] reqwest::Error),
-
-    #[error("Error: {0}")]
-    Response(String),
-}
-
-impl From<anyhow::Error> for PageserverHttpError {
-    fn from(e: anyhow::Error) -> Self {
-        Self::Response(e.to_string())
-    }
-}
-
-type Result<T> = result::Result<T, PageserverHttpError>;
-
-pub trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> Result<Self>;
-}
-
-impl ResponseErrorMessageExt for Response {
-    fn error_from_body(self) -> Result<Self> {
-        let status = self.status();
-        if !(status.is_client_error() || status.is_server_error()) {
-            return Ok(self);
-        }
-
-        // reqwest does not export its error construction utility functions, so let's craft the message ourselves
-        let url = self.url().to_owned();
-        Err(PageserverHttpError::Response(
-            match self.json::<HttpErrorBody>() {
-                Ok(err_body) => format!("Error: {}", err_body.msg),
-                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
-            },
-        ))
-    }
-}
-
 //
 // Control routines for pageserver.
 //
@@ -87,8 +44,7 @@ pub struct PageServerNode {
     pub pg_connection_config: PgConnectionConfig,
     pub conf: PageServerConf,
     pub env: LocalEnv,
-    pub http_client: Client,
-    pub http_base_url: String,
+    pub http_client: mgmt_api::Client,
 }
 
 impl PageServerNode {
@@ -100,8 +56,19 @@ impl PageServerNode {
             pg_connection_config: PgConnectionConfig::new_host_port(host, port),
             conf: conf.clone(),
             env: env.clone(),
-            http_client: Client::new(),
-            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
+            http_client: mgmt_api::Client::new(
+                format!("http://{}", conf.listen_http_addr),
+                {
+                    match conf.http_auth_type {
+                        AuthType::Trust => None,
+                        AuthType::NeonJWT => Some(
+                            env.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
+                                .unwrap(),
+                        ),
+                    }
+                }
+                .as_deref(),
+            ),
         }
     }
 
@@ -182,8 +149,8 @@ impl PageServerNode {
             .expect("non-Unicode path")
     }
 
-    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
-        self.start_node(config_overrides, false)
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
+        self.start_node(config_overrides, false).await
     }
 
     fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -224,7 +191,12 @@ impl PageServerNode {
         Ok(())
     }
 
-    fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
+    async fn start_node(
+        &self,
+        config_overrides: &[&str],
+        update_config: bool,
+    ) -> anyhow::Result<Child> {
+        // start_process() is async, so the readiness probe below awaits check_status() directly.
         let datadir = self.repo_path();
         print!(
             "Starting pageserver node {} at '{}' in {:?}",
@@ -232,7 +204,7 @@ impl PageServerNode {
             self.pg_connection_config.raw_address(),
             datadir
         );
-        io::stdout().flush()?;
+        io::stdout().flush().context("flush stdout")?;
 
         let datadir_path_str = datadir.to_str().with_context(|| {
             format!(
@@ -244,20 +216,23 @@ impl PageServerNode {
         if update_config {
             args.push(Cow::Borrowed("--update-config"));
         }
-
         background_process::start_process(
             "pageserver",
             &datadir,
             &self.env.pageserver_bin(),
             args.iter().map(Cow::as_ref),
             self.pageserver_env_variables()?,
-            background_process::InitialPidFile::Expect(&self.pid_file()),
-            || match self.check_status() {
-                Ok(()) => Ok(true),
-                Err(PageserverHttpError::Transport(_)) => Ok(false),
-                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+            background_process::InitialPidFile::Expect(self.pid_file()),
+            || async {
+                let st = self.check_status().await;
+                match st {
+                    Ok(()) => Ok(true),
+                    Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
+                    Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+                }
             },
         )
+        .await
     }
 
     fn pageserver_basic_args<'a>(
@@ -303,7 +278,12 @@ impl PageServerNode {
         background_process::stop_process(immediate, "pageserver", &self.pid_file())
     }
 
-    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
+    pub async fn page_server_psql_client(
+        &self,
+    ) -> anyhow::Result<(
+        tokio_postgres::Client,
+        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
+    )> {
         let mut config = self.pg_connection_config.clone();
         if self.conf.pg_auth_type == AuthType::NeonJWT {
             let token = self
@@ -311,36 +291,18 @@ impl PageServerNode {
                 .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
             config = config.set_password(Some(token));
         }
-        Ok(config.connect_no_tls()?)
+        Ok(config.connect_no_tls().await?)
     }
 
-    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
-        let mut builder = self.http_client.request(method, url);
-        if self.conf.http_auth_type == AuthType::NeonJWT {
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-            builder = builder.bearer_auth(token)
-        }
-        Ok(builder)
+    pub async fn check_status(&self) -> mgmt_api::Result<()> {
+        self.http_client.status().await
     }
 
-    pub fn check_status(&self) -> Result<()> {
-        self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
-            .send()?
-            .error_from_body()?;
-        Ok(())
+    pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
+        self.http_client.list_tenants().await
     }
 
-    pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
-        Ok(self
-            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
-            .send()?
-            .error_from_body()?
-            .json()?)
-    }
-
-    pub fn tenant_create(
+    pub async fn tenant_create(
         &self,
         new_tenant_id: TenantId,
         generation: Option<u32>,
@@ -418,23 +380,10 @@ impl PageServerNode {
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
         }
-        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
-            .json(&request)
-            .send()?
-            .error_from_body()?
-            .json::<Option<String>>()
-            .with_context(|| {
-                format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}")
-            })?
-            .context("No tenant id was found in the tenant creation response")
-            .and_then(|tenant_id_string| {
-                tenant_id_string.parse().with_context(|| {
-                    format!("Failed to parse response string as tenant id: '{tenant_id_string}'")
-                })
-            })
+        Ok(self.http_client.tenant_create(&request).await?)
     }
 
-    pub fn tenant_config(
+    pub async fn tenant_config(
         &self,
         tenant_id: TenantId,
         mut settings: HashMap<&str, &str>,
@@ -513,54 +462,30 @@ impl PageServerNode {
             bail!("Unrecognized tenant settings: {settings:?}")
         }
 
-        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
-            .json(&models::TenantConfigRequest { tenant_id, config })
-            .send()?
-            .error_from_body()?;
+        self.http_client
+            .tenant_config(&models::TenantConfigRequest { tenant_id, config })
+            .await?;
 
         Ok(())
     }
 
-    pub fn location_config(
+    pub async fn location_config(
         &self,
         tenant_id: TenantId,
         config: LocationConfig,
         flush_ms: Option<Duration>,
     ) -> anyhow::Result<()> {
-        let req_body = TenantLocationConfigRequest { tenant_id, config };
-
-        let path = format!(
-            "{}/tenant/{}/location_config",
-            self.http_base_url, tenant_id
-        );
-        let path = if let Some(flush_ms) = flush_ms {
-            format!("{}?flush_ms={}", path, flush_ms.as_millis())
-        } else {
-            path
-        };
-
-        self.http_request(Method::PUT, path)?
-            .json(&req_body)
-            .send()?
-            .error_from_body()?;
-
-        Ok(())
+        Ok(self
+            .http_client
+            .location_config(tenant_id, config, flush_ms)
+            .await?)
     }
 
-    pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
-        let timeline_infos: Vec<TimelineInfo> = self
-            .http_request(
-                Method::GET,
-                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-            )?
-            .send()?
-            .error_from_body()?
-            .json()?;
-
-        Ok(timeline_infos)
+    pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
+        Ok(self.http_client.list_timelines(*tenant_id).await?)
     }
 
-    pub fn timeline_create(
+    pub async fn timeline_create(
         &self,
         tenant_id: TenantId,
         new_timeline_id: Option<TimelineId>,
@@ -571,29 +496,14 @@ impl PageServerNode {
     ) -> anyhow::Result<TimelineInfo> {
         // If timeline ID was not specified, generate one
         let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
-
-        self.http_request(
-            Method::POST,
-            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-        )?
-        .json(&models::TimelineCreateRequest {
+        let req = models::TimelineCreateRequest {
             new_timeline_id,
             ancestor_start_lsn,
             ancestor_timeline_id,
             pg_version,
             existing_initdb_timeline_id,
-        })
-        .send()?
-        .error_from_body()?
-        .json::<Option<TimelineInfo>>()
-        .with_context(|| {
-            format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
-        })?
-        .with_context(|| {
-            format!(
-                "No timeline id was found in the timeline creation response for tenant {tenant_id}"
-            )
-        })
+        };
+        Ok(self.http_client.timeline_create(tenant_id, &req).await?)
     }
 
     /// Import a basebackup prepared using either:
@@ -605,7 +515,7 @@ impl PageServerNode {
     /// * `timeline_id` - id to assign to imported timeline
     /// * `base` - (start lsn of basebackup, path to `base.tar` file)
     /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
-    pub fn timeline_import(
+    pub async fn timeline_import(
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
@@ -613,36 +523,60 @@ impl PageServerNode {
         pg_wal: Option<(Lsn, PathBuf)>,
         pg_version: u32,
     ) -> anyhow::Result<()> {
-        let mut client = self.page_server_psql_client()?;
+        let (client, conn) = self.page_server_psql_client().await?;
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own.
+        tokio::spawn(async move {
+            if let Err(e) = conn.await {
+                eprintln!("connection error: {}", e);
+            }
+        });
+        tokio::pin!(client);
 
         // Init base reader
         let (start_lsn, base_tarfile_path) = base;
-        let base_tarfile = File::open(base_tarfile_path)?;
-        let mut base_reader = BufReader::new(base_tarfile);
+        let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
+        let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
 
         // Init wal reader if necessary
         let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
-            let wal_tarfile = File::open(wal_tarfile_path)?;
-            let wal_reader = BufReader::new(wal_tarfile);
+            let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
+            let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
             (end_lsn, Some(wal_reader))
         } else {
             (start_lsn, None)
         };
 
-        // Import base
-        let import_cmd = format!(
-            "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
-        );
-        let mut writer = client.copy_in(&import_cmd)?;
-        io::copy(&mut base_reader, &mut writer)?;
-        writer.finish()?;
+        let copy_in = |reader, cmd| {
+            let client = &client;
+            async move {
+                let writer = client.copy_in(&cmd).await?;
+                let writer = std::pin::pin!(writer);
+                let mut writer = writer.sink_map_err(|e| {
+                    std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
+                });
+                let mut reader = std::pin::pin!(reader);
+                writer.send_all(&mut reader).await?;
+                writer.into_inner().finish().await?;
+                anyhow::Ok(())
+            }
+        };
 
+        // Import base
+        copy_in(
+            base_tarfile,
+            format!(
+                "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
+            ),
+        )
+        .await?;
         // Import wal if necessary
-        if let Some(mut wal_reader) = wal_reader {
-            let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
-            let mut writer = client.copy_in(&import_cmd)?;
-            io::copy(&mut wal_reader, &mut writer)?;
-            writer.finish()?;
+        if let Some(wal_reader) = wal_reader {
+            copy_in(
+                wal_reader,
+                format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
+            )
+            .await?;
         }
 
         Ok(())
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index a8baa0ac53..4026ef0eb9 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -13,7 +13,6 @@ use std::{io, result};
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
-use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::{http::error::HttpErrorBody, id::NodeId};
@@ -34,12 +33,14 @@ pub enum SafekeeperHttpError {
 
 type Result<T> = result::Result<T, SafekeeperHttpError>;
 
+#[async_trait::async_trait]
 pub trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> Result<Self>;
+    async fn error_from_body(self) -> Result<Self>;
 }
 
-impl ResponseErrorMessageExt for Response {
-    fn error_from_body(self) -> Result<Self> {
+#[async_trait::async_trait]
+impl ResponseErrorMessageExt for reqwest::Response {
+    async fn error_from_body(self) -> Result<Self> {
         let status = self.status();
         if !(status.is_client_error() || status.is_server_error()) {
             return Ok(self);
@@ -48,7 +49,7 @@ impl ResponseErrorMessageExt for Response {
         // reqwest does not export its error construction utility functions, so let's craft the message ourselves
         let url = self.url().to_owned();
         Err(SafekeeperHttpError::Response(
-            match self.json::<HttpErrorBody>() {
+            match self.json::<HttpErrorBody>().await {
                 Ok(err_body) => format!("Error: {}", err_body.msg),
                 Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
             },
@@ -69,7 +70,7 @@ pub struct SafekeeperNode {
 
     pub pg_connection_config: PgConnectionConfig,
     pub env: LocalEnv,
-    pub http_client: Client,
+    pub http_client: reqwest::Client,
     pub http_base_url: String,
 }
 
@@ -80,7 +81,7 @@ impl SafekeeperNode {
             conf: conf.clone(),
             pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
             env: env.clone(),
-            http_client: Client::new(),
+            http_client: reqwest::Client::new(),
             http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
         }
     }
@@ -103,7 +104,7 @@ impl SafekeeperNode {
             .expect("non-Unicode path")
     }
 
-    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
         print!(
             "Starting safekeeper at '{}' in '{}'",
             self.pg_connection_config.raw_address(),
@@ -191,13 +192,16 @@ impl SafekeeperNode {
             &self.env.safekeeper_bin(),
             &args,
             [],
-            background_process::InitialPidFile::Expect(&self.pid_file()),
-            || match self.check_status() {
-                Ok(()) => Ok(true),
-                Err(SafekeeperHttpError::Transport(_)) => Ok(false),
-                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+            background_process::InitialPidFile::Expect(self.pid_file()),
+            || async {
+                match self.check_status().await {
+                    Ok(()) => Ok(true),
+                    Err(SafekeeperHttpError::Transport(_)) => Ok(false),
+                    Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+                }
             },
         )
+        .await
     }
 
     ///
@@ -216,7 +220,7 @@ impl SafekeeperNode {
         )
     }
 
-    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
+    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
         // TODO: authentication
         //if self.env.auth_type == AuthType::NeonJWT {
         //    builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
@@ -224,10 +228,12 @@ impl SafekeeperNode {
         self.http_client.request(method, url)
     }
 
-    pub fn check_status(&self) -> Result<()> {
+    pub async fn check_status(&self) -> Result<()> {
         self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
-            .send()?
-            .error_from_body()?;
+            .send()
+            .await?
+            .error_from_body()
+            .await?;
         Ok(())
     }
 }
diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs
index fbb0358158..79df108896 100644
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -19,11 +19,11 @@ use utils::{
 };
 
 /// Given an attached pageserver, retrieve the LSN for all timelines
-fn get_lsns(
+async fn get_lsns(
     tenant_id: TenantId,
     pageserver: &PageServerNode,
 ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-    let timelines = pageserver.timeline_list(&tenant_id)?;
+    let timelines = pageserver.timeline_list(&tenant_id).await?;
     Ok(timelines
         .into_iter()
         .map(|t| (t.timeline_id, t.last_record_lsn))
@@ -32,13 +32,13 @@ fn get_lsns(
 
 /// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
 /// `baseline`.
-fn await_lsn(
+async fn await_lsn(
     tenant_id: TenantId,
     pageserver: &PageServerNode,
     baseline: HashMap<TimelineId, Lsn>,
 ) -> anyhow::Result<()> {
     loop {
-        let latest = match get_lsns(tenant_id, pageserver) {
+        let latest = match get_lsns(tenant_id, pageserver).await {
             Ok(l) => l,
             Err(e) => {
                 println!(
@@ -84,7 +84,7 @@ fn await_lsn(
 ///  - Coordinate attach/secondary/detach on pageservers
 ///  - call into attachment_service for generations
 ///  - reconfigure compute endpoints to point to new attached pageserver
-pub fn migrate_tenant(
+pub async fn migrate_tenant(
     env: &LocalEnv,
     tenant_id: TenantId,
     dest_ps: PageServerNode,
@@ -108,16 +108,18 @@ pub fn migrate_tenant(
         }
     }
 
-    let previous = attachment_service.inspect(tenant_id)?;
+    let previous = attachment_service.inspect(tenant_id).await?;
     let mut baseline_lsns = None;
     if let Some((generation, origin_ps_id)) = &previous {
         let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
 
         if origin_ps_id == &dest_ps.conf.id {
             println!("🔁 Already attached to {origin_ps_id}, freshening...");
-            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+            let gen = attachment_service
+                .attach_hook(tenant_id, dest_ps.conf.id)
+                .await?;
             let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-            dest_ps.location_config(tenant_id, dest_conf, None)?;
+            dest_ps.location_config(tenant_id, dest_conf, None).await?;
             println!("✅ Migration complete");
             return Ok(());
         }
@@ -126,20 +128,24 @@ pub fn migrate_tenant(
 
         let stale_conf =
             build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
-        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
+        origin_ps
+            .location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
+            .await?;
 
-        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
+        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
     }
 
-    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+    let gen = attachment_service
+        .attach_hook(tenant_id, dest_ps.conf.id)
+        .await?;
     let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
 
     println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
-    dest_ps.location_config(tenant_id, dest_conf, None)?;
+    dest_ps.location_config(tenant_id, dest_conf, None).await?;
 
     if let Some(baseline) = baseline_lsns {
         println!("🕑 Waiting for LSN to catch up...");
-        await_lsn(tenant_id, &dest_ps, baseline)?;
+        await_lsn(tenant_id, &dest_ps, baseline).await?;
     }
 
     let cplane = ComputeControlPlane::load(env.clone())?;
@@ -149,7 +155,7 @@ pub fn migrate_tenant(
                 "🔁 Reconfiguring endpoint {} to use pageserver {}",
                 endpoint_name, dest_ps.conf.id
             );
-            endpoint.reconfigure(Some(dest_ps.conf.id))?;
+            endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
         }
     }
 
@@ -159,7 +165,7 @@ pub fn migrate_tenant(
         }
 
         let other_ps = PageServerNode::from_env(env, other_ps_conf);
-        let other_ps_tenants = other_ps.tenant_list()?;
+        let other_ps_tenants = other_ps.tenant_list().await?;
 
         // Check if this tenant is attached
         let found = other_ps_tenants
@@ -181,7 +187,9 @@ pub fn migrate_tenant(
             "💤 Switching to secondary mode on pageserver {}",
             other_ps.conf.id
         );
-        other_ps.location_config(tenant_id, secondary_conf, None)?;
+        other_ps
+            .location_config(tenant_id, secondary_conf, None)
+            .await?;
     }
 
     println!(
@@ -189,7 +197,7 @@ pub fn migrate_tenant(
         dest_ps.conf.id
     );
     let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-    dest_ps.location_config(tenant_id, dest_conf, None)?;
+    dest_ps.location_config(tenant_id, dest_conf, None).await?;
 
     println!("✅ Migration complete");
 
diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs
index 35cb1a2691..ccf9108895 100644
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -163,8 +163,18 @@ impl PgConnectionConfig {
     }
 
     /// Connect using postgres protocol with TLS disabled.
-    pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
-        postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls)
+    pub async fn connect_no_tls(
+        &self,
+    ) -> Result<
+        (
+            tokio_postgres::Client,
+            tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
+        ),
+        postgres::Error,
+    > {
+        self.to_tokio_postgres_config()
+            .connect(postgres::NoTls)
+            .await
     }
 }
 
diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml
new file mode 100644
index 0000000000..4bd36185a6
--- /dev/null
+++ b/pageserver/client/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "pageserver_client"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+pageserver_api.workspace = true
+thiserror.workspace = true
+async-trait.workspace = true
+reqwest.workspace = true
+utils.workspace = true
+serde.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs
new file mode 100644
index 0000000000..3963fd466c
--- /dev/null
+++ b/pageserver/client/src/lib.rs
@@ -0,0 +1 @@
+pub mod mgmt_api;
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
new file mode 100644
index 0000000000..821c238082
--- /dev/null
+++ b/pageserver/client/src/mgmt_api.rs
@@ -0,0 +1,172 @@
+use pageserver_api::models::*;
+use reqwest::{IntoUrl, Method};
+use utils::{
+    http::error::HttpErrorBody,
+    id::{TenantId, TimelineId},
+};
+
+#[derive(Debug)]
+pub struct Client {
+    mgmt_api_endpoint: String,
+    authorization_header: Option<String>,
+    client: reqwest::Client,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("receive body: {0}")]
+    ReceiveBody(reqwest::Error),
+
+    #[error("receive error body: {0}")]
+    ReceiveErrorBody(String),
+
+    #[error("pageserver API: {0}")]
+    ApiError(String),
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[async_trait::async_trait]
+pub trait ResponseErrorMessageExt: Sized {
+    async fn error_from_body(self) -> Result<Self>;
+}
+
+#[async_trait::async_trait]
+impl ResponseErrorMessageExt for reqwest::Response {
+    async fn error_from_body(mut self) -> Result<Self> {
+        let status = self.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            return Ok(self);
+        }
+
+        let url = self.url().to_owned();
+        Err(match self.json::<HttpErrorBody>().await {
+            Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
+            Err(_) => {
+                Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
+            }
+        })
+    }
+}
+
+impl Client {
+    pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
+        Self {
+            mgmt_api_endpoint,
+            authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
+            client: reqwest::Client::new(),
+        }
+    }
+
+    pub async fn list_tenants(&self) -> Result<Vec<pageserver_api::models::TenantInfo>> {
+        let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
+        let resp = self.get(&uri).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+
+    pub async fn list_timelines(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
+        let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn timeline_info(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<pageserver_api::models::TimelineInfo> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        );
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
+        self.request(Method::GET, uri, ()).await
+    }
+
+    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+        body: B,
+    ) -> Result<reqwest::Response> {
+        let req = self.client.request(method, uri);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+        let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
+        let response = res.error_from_body().await?;
+        Ok(response)
+    }
+
+    pub async fn status(&self) -> Result<()> {
+        let uri = format!("{}/v1/status", self.mgmt_api_endpoint);
+        self.get(&uri).await?;
+        Ok(())
+    }
+
+    pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
+        let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
+        self.request(Method::POST, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
+        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
+        self.request(Method::PUT, &uri, req).await?;
+        Ok(())
+    }
+
+    pub async fn location_config(
+        &self,
+        tenant_id: TenantId,
+        config: LocationConfig,
+        flush_ms: Option<std::time::Duration>,
+    ) -> Result<()> {
+        let req_body = TenantLocationConfigRequest { tenant_id, config };
+        let path = format!(
+            "{}/v1/tenant/{}/location_config",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        let path = if let Some(flush_ms) = flush_ms {
+            format!("{}?flush_ms={}", path, flush_ms.as_millis())
+        } else {
+            path
+        };
+        self.request(Method::PUT, &path, &req_body).await?;
+        Ok(())
+    }
+
+    pub async fn timeline_create(
+        &self,
+        tenant_id: TenantId,
+        req: &TimelineCreateRequest,
+    ) -> Result<TimelineInfo> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        self.request(Method::POST, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+}

From 56f7d55ba7f1aaab6f997754b29c27f0a16dff94 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Dec 2023 17:43:02 +0000
Subject: [PATCH 109/202] pageserver: basic cancel/timeout for remote storage
 operations (#6097)

## Problem

Various places in remote storage were not subject to a timeout (thereby
stuck TCP connections could hold things up), and did not respect a
cancellation token (so things like timeline deletion or tenant detach
would have to wait arbitrarily long).



## Summary of changes

- Add download_cancellable and upload_cancellable helpers, and use them
in all the places we wait for remote storage operations (with the
exception of initdb downloads, where it would not have been safe).
- Add a cancellation token arg to `download_retry`.
- Use cancellation token args in various places that were missing one
per #5066

Closes: #5066

Why is this only "basic" handling?
- Doesn't express difference between shutdown and errors in return
types, to avoid refactoring all the places that use an anyhow::Error
(these should all eventually return a more structured error type)
- Implements timeouts on top of remote storage, rather than within it:
this means that operations hitting their timeout will lose their
semaphore permit and thereby go to the back of the queue for their
retry.
- Doing a nicer job is tracked in
https://github.com/neondatabase/neon/issues/6096
---
 libs/utils/src/timeout.rs                     |  3 +
 pageserver/src/tenant.rs                      |  6 +-
 pageserver/src/tenant/delete.rs               | 30 +++++--
 .../src/tenant/remote_timeline_client.rs      | 53 +++++++++++-
 .../tenant/remote_timeline_client/download.rs | 86 ++++++++++++++-----
 .../tenant/remote_timeline_client/upload.rs   | 30 ++++---
 pageserver/src/tenant/storage_layer/layer.rs  |  1 +
 .../fixtures/pageserver/allowed_errors.py     |  3 +
 8 files changed, 166 insertions(+), 46 deletions(-)

diff --git a/libs/utils/src/timeout.rs b/libs/utils/src/timeout.rs
index 11fa417242..56bf57a900 100644
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -2,8 +2,11 @@ use std::time::Duration;
 
 use tokio_util::sync::CancellationToken;
 
+#[derive(thiserror::Error, Debug)]
 pub enum TimeoutCancellableError {
+    #[error("Timed out")]
     Timeout,
+    #[error("Cancelled")]
     Cancelled,
 }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ace20e44e2..0d1be33331 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3059,6 +3059,7 @@ impl Tenant {
                     storage,
                     &self.tenant_shard_id,
                     &existing_initdb_timeline_id,
+                    &self.cancel,
                 )
                 .await
                 .context("download initdb tar")?;
@@ -3099,6 +3100,7 @@ impl Tenant {
                             &timeline_id,
                             pgdata_zstd.try_clone().await?,
                             tar_zst_size,
+                            &self.cancel,
                         )
                         .await
                     },
@@ -3106,9 +3108,7 @@ impl Tenant {
                     3,
                     u32::MAX,
                     "persist_initdb_tar_zst",
-                    backoff::Cancel::new(self.cancel.clone(), || {
-                        anyhow::anyhow!("initdb upload cancelled")
-                    }),
+                    backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
                 )
                 .await?;
 
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index acd311ace6..e8491f26db 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -71,6 +71,7 @@ async fn create_remote_delete_mark(
     conf: &PageServerConf,
     remote_storage: &GenericRemoteStorage,
     tenant_shard_id: &TenantShardId,
+    cancel: &CancellationToken,
 ) -> Result<(), DeleteTenantError> {
     let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
 
@@ -87,8 +88,7 @@ async fn create_remote_delete_mark(
         FAILED_UPLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         "mark_upload",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
     )
     .await
     .context("mark_upload")?;
@@ -170,6 +170,7 @@ async fn remove_tenant_remote_delete_mark(
     conf: &PageServerConf,
     remote_storage: Option<&GenericRemoteStorage>,
     tenant_shard_id: &TenantShardId,
+    cancel: &CancellationToken,
 ) -> Result<(), DeleteTenantError> {
     if let Some(remote_storage) = remote_storage {
         let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
@@ -179,8 +180,7 @@ async fn remove_tenant_remote_delete_mark(
             FAILED_UPLOAD_WARN_THRESHOLD,
             FAILED_REMOTE_OP_RETRIES,
             "remove_tenant_remote_delete_mark",
-            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
         )
         .await
         .context("remove_tenant_remote_delete_mark")?;
@@ -322,9 +322,15 @@ impl DeleteTenantFlow {
         // Though sounds scary, different mark name?
         // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
         if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
-                .await
-                .context("remote_mark")?
+            create_remote_delete_mark(
+                conf,
+                remote_storage,
+                &tenant.tenant_shard_id,
+                // Can't use tenant.cancel, it's already shut down.  TODO: wire in an appropriate token
+                &CancellationToken::new(),
+            )
+            .await
+            .context("remote_mark")?
         }
 
         fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -524,8 +530,14 @@ impl DeleteTenantFlow {
                 .context("timelines dir not empty")?;
         }
 
-        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
-            .await?;
+        remove_tenant_remote_delete_mark(
+            conf,
+            remote_storage.as_ref(),
+            &tenant.tenant_shard_id,
+            // Can't use tenant.cancel, it's already shut down.  TODO: wire in an appropriate token
+            &CancellationToken::new(),
+        )
+        .await?;
 
         fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
             Err(anyhow::anyhow!(
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 4b271a7395..52ee8f49ce 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -196,10 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
     self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
+use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
 
 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
+use std::time::Duration;
 
 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
@@ -316,6 +318,47 @@ pub struct RemoteTimelineClient {
     storage_impl: GenericRemoteStorage,
 
     deletion_queue_client: DeletionQueueClient,
+
+    cancel: CancellationToken,
+}
+
+/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
+/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
+const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+
+/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
+///
+/// This is a convenience for the various upload functions.  In future
+/// the anyhow::Error result should be replaced with a more structured type that
+/// enables callers to avoid handling shutdown as an error.
+async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
+where
+    F: std::future::Future<Output = anyhow::Result<()>>,
+{
+    match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
+        Ok(Ok(())) => Ok(()),
+        Ok(Err(e)) => Err(e),
+        Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
+        Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
+    }
+}
+/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloadError.
+async fn download_cancellable<F, R>(
+    cancel: &CancellationToken,
+    future: F,
+) -> Result<R, DownloadError>
+where
+    F: std::future::Future<Output = Result<R, DownloadError>>,
+{
+    match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
+        Ok(Ok(r)) => Ok(r),
+        Ok(Err(e)) => Err(e),
+        Err(TimeoutCancellableError::Timeout) => {
+            Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
+        }
+        Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
+    }
 }
 
 impl RemoteTimelineClient {
@@ -351,6 +394,7 @@ impl RemoteTimelineClient {
                 &tenant_shard_id,
                 &timeline_id,
             )),
+            cancel: CancellationToken::new(),
         }
     }
 
@@ -501,6 +545,7 @@ impl RemoteTimelineClient {
         &self,
         layer_file_name: &LayerFileName,
         layer_metadata: &LayerFileMetadata,
+        cancel: &CancellationToken,
     ) -> anyhow::Result<u64> {
         let downloaded_size = {
             let _unfinished_gauge_guard = self.metrics.call_begin(
@@ -517,6 +562,7 @@ impl RemoteTimelineClient {
                 self.timeline_id,
                 layer_file_name,
                 layer_metadata,
+                cancel,
             )
             .measure_remote_op(
                 self.tenant_shard_id.tenant_id,
@@ -971,6 +1017,7 @@ impl RemoteTimelineClient {
                     &self.timeline_id,
                     self.generation,
                     &index_part_with_deleted_at,
+                    &self.cancel,
                 )
             },
             |_e| false,
@@ -980,8 +1027,7 @@ impl RemoteTimelineClient {
             // when executed as part of tenant deletion this happens in the background
             2,
             "persist_index_part_with_deleted_flag",
-            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
         )
         .await?;
 
@@ -1281,6 +1327,7 @@ impl RemoteTimelineClient {
                         path,
                         layer_metadata,
                         self.generation,
+                        &self.cancel,
                     )
                     .measure_remote_op(
                         self.tenant_shard_id.tenant_id,
@@ -1307,6 +1354,7 @@ impl RemoteTimelineClient {
                         &self.timeline_id,
                         self.generation,
                         index_part,
+                        &self.cancel,
                     )
                     .measure_remote_op(
                         self.tenant_shard_id.tenant_id,
@@ -1828,6 +1876,7 @@ mod tests {
                     &self.harness.tenant_shard_id,
                     &TIMELINE_ID,
                 )),
+                cancel: CancellationToken::new(),
             })
         }
 
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index ed32c4eed9..d3956163c8 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -5,7 +5,6 @@
 
 use std::collections::HashSet;
 use std::future::Future;
-use std::time::Duration;
 
 use anyhow::{anyhow, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -14,13 +13,17 @@ use tokio::fs::{self, File, OpenOptions};
 use tokio::io::{AsyncSeekExt, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
+use utils::timeout::timeout_cancellable;
 use utils::{backoff, crashsafe};
 
 use crate::config::PageServerConf;
-use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
+use crate::tenant::remote_timeline_client::{
+    download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
+};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
+use crate::virtual_file::on_fatal_io_error;
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
@@ -32,8 +35,6 @@ use super::{
     FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };
 
-static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
-
 ///
 /// If 'metadata' is given, we will validate that the downloaded file's size matches that
 /// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
@@ -46,6 +47,7 @@ pub async fn download_layer_file<'a>(
     timeline_id: TimelineId,
     layer_file_name: &'a LayerFileName,
     layer_metadata: &'a LayerFileMetadata,
+    cancel: &CancellationToken,
 ) -> Result<u64, DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
@@ -73,14 +75,18 @@ pub async fn download_layer_file<'a>(
     // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
     let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
 
+    let cancel_inner = cancel.clone();
     let (mut destination_file, bytes_amount) = download_retry(
         || async {
             let destination_file = tokio::fs::File::create(&temp_file_path)
                 .await
                 .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                 .map_err(DownloadError::Other)?;
-            let download = storage
-                .download(&remote_path)
+
+            // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
+            // file: the write to local file doesn't start until after the request header is returned
+            // and we start draining the body stream below
+            let download = download_cancellable(&cancel_inner, storage.download(&remote_path))
                 .await
                 .with_context(|| {
                     format!(
@@ -94,12 +100,33 @@ pub async fn download_layer_file<'a>(
 
             let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
 
-            let bytes_amount = tokio::time::timeout(
-                MAX_DOWNLOAD_DURATION,
+            // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
+            // and we will unlink the temporary file if there is an error.  This unlink is important because we
+            // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
+            // we will imminently try and write to again.
+            let bytes_amount: u64 = match timeout_cancellable(
+                DOWNLOAD_TIMEOUT,
+                &cancel_inner,
                 tokio::io::copy_buf(&mut reader, &mut destination_file),
             )
             .await
-            .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
+            .with_context(|| {
+                format!(
+                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
+                )
+            })
+            .map_err(DownloadError::Other)?
+            {
+                Ok(b) => Ok(b),
+                Err(e) => {
+                    // Remove incomplete files: on restart Timeline would do this anyway, but we must
+                    // do it here for the retry case.
+                    if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
+                        on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
+                    }
+                    Err(e)
+                }
+            }
             .with_context(|| {
                 format!(
                     "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
@@ -112,6 +139,7 @@ pub async fn download_layer_file<'a>(
             Ok((destination_file, bytes_amount))
         },
         &format!("download {remote_path:?}"),
+        cancel,
     )
     .await?;
 
@@ -188,8 +216,14 @@ pub async fn list_remote_timelines(
         anyhow::bail!("storage-sync-list-remote-timelines");
     });
 
+    let cancel_inner = cancel.clone();
     let listing = download_retry_forever(
-        || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
+        || {
+            download_cancellable(
+                &cancel_inner,
+                storage.list(Some(&remote_path), ListingMode::WithDelimiter),
+            )
+        },
         &format!("list timelines for {tenant_shard_id}"),
         cancel,
     )
@@ -230,9 +264,13 @@ async fn do_download_index_part(
 
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
+    let cancel_inner = cancel.clone();
     let index_part_bytes = download_retry_forever(
         || async {
-            let index_part_download = storage.download(&remote_path).await?;
+            // Cancellation: it is safe to cancel this future because we're just downloading into
+            // a memory buffer, not touching local disk.
+            let index_part_download =
+                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
 
             let mut index_part_bytes = Vec::new();
             let mut stream = std::pin::pin!(index_part_download.download_stream);
@@ -347,10 +385,7 @@ pub(super) async fn download_index_part(
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         "listing index_part files",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error {
-            unreachable!()
-        }),
+        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
     )
     .await
     .map_err(DownloadError::Other)?;
@@ -389,6 +424,7 @@ pub(crate) async fn download_initdb_tar_zst(
     storage: &GenericRemoteStorage,
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
+    cancel: &CancellationToken,
 ) -> Result<(Utf8PathBuf, File), DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
@@ -406,6 +442,8 @@ pub(crate) async fn download_initdb_tar_zst(
         "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
     ));
 
+    let cancel_inner = cancel.clone();
+
     let file = download_retry(
         || async {
             let file = OpenOptions::new()
@@ -418,10 +456,14 @@ pub(crate) async fn download_initdb_tar_zst(
                 .with_context(|| format!("tempfile creation {temp_path}"))
                 .map_err(DownloadError::Other)?;
 
-            let download = storage.download(&remote_path).await?;
+            let download =
+                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
             let mut download = tokio_util::io::StreamReader::new(download.download_stream);
             let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
 
+            // TODO: this consumption of the response body should be subject to timeout + cancellation, but
+            // not without thinking carefully about how to recover safely from cancelling a write to
+            // local storage (e.g. by writing into a temp file as we do in download_layer)
             tokio::io::copy_buf(&mut download, &mut writer)
                 .await
                 .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
@@ -437,6 +479,7 @@ pub(crate) async fn download_initdb_tar_zst(
             Ok(file)
         },
         &format!("download {remote_path}"),
+        cancel,
     )
     .await
     .map_err(|e| {
@@ -460,7 +503,11 @@ pub(crate) async fn download_initdb_tar_zst(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
+async fn download_retry<T, O, F>(
+    op: O,
+    description: &str,
+    cancel: &CancellationToken,
+) -> Result<T, DownloadError>
 where
     O: FnMut() -> F,
     F: Future<Output = Result<T, DownloadError>>,
@@ -471,10 +518,7 @@ where
         FAILED_DOWNLOAD_WARN_THRESHOLD,
         FAILED_REMOTE_OP_RETRIES,
         description,
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
-            unreachable!()
-        }),
+        backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled),
     )
     .await
 }
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 9f5b3c242b..11c6956875 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -7,12 +7,14 @@ use pageserver_api::shard::TenantShardId;
 use std::io::{ErrorKind, SeekFrom};
 use tokio::fs::{self, File};
 use tokio::io::AsyncSeekExt;
+use tokio_util::sync::CancellationToken;
 
 use super::Generation;
 use crate::{
     config::PageServerConf,
     tenant::remote_timeline_client::{
         index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
+        upload_cancellable,
     },
 };
 use remote_storage::GenericRemoteStorage;
@@ -29,6 +31,7 @@ pub(super) async fn upload_index_part<'a>(
     timeline_id: &TimelineId,
     generation: Generation,
     index_part: &'a IndexPart,
+    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
     tracing::trace!("uploading new index part");
 
@@ -44,14 +47,16 @@ pub(super) async fn upload_index_part<'a>(
     let index_part_bytes = bytes::Bytes::from(index_part_bytes);
 
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
-    storage
-        .upload_storage_object(
+    upload_cancellable(
+        cancel,
+        storage.upload_storage_object(
             futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
             index_part_size,
             &remote_path,
-        )
-        .await
-        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
+        ),
+    )
+    .await
+    .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
 
 /// Attempts to upload given layer files.
@@ -64,6 +69,7 @@ pub(super) async fn upload_timeline_layer<'a>(
     source_path: &'a Utf8Path,
     known_metadata: &'a LayerFileMetadata,
     generation: Generation,
+    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
     fail_point!("before-upload-layer", |_| {
         bail!("failpoint before-upload-layer")
@@ -107,8 +113,7 @@ pub(super) async fn upload_timeline_layer<'a>(
 
     let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
 
-    storage
-        .upload(reader, fs_size, &storage_path, None)
+    upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
         .await
         .with_context(|| format!("upload layer from local path '{source_path}'"))?;
 
@@ -122,6 +127,7 @@ pub(crate) async fn upload_initdb_dir(
     timeline_id: &TimelineId,
     mut initdb_tar_zst: File,
     size: u64,
+    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
     tracing::trace!("uploading initdb dir");
 
@@ -131,8 +137,10 @@ pub(crate) async fn upload_initdb_dir(
     let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
 
     let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
-    storage
-        .upload_storage_object(file, size as usize, &remote_path)
-        .await
-        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+    upload_cancellable(
+        cancel,
+        storage.upload_storage_object(file, size as usize, &remote_path),
+    )
+    .await
+    .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 69a2893456..a4b102c314 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -862,6 +862,7 @@ impl LayerInner {
                 let result = client.download_layer_file(
                     &this.desc.filename(),
                     &this.metadata(),
+                    &crate::task_mgr::shutdown_token()
                 )
                 .await;
 
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 5c3ae3ce4b..74c6bddf23 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -79,6 +79,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     # AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
     # and it is not a failure of our code when it happens.
     ".*DeleteObjects.*We encountered an internal error. Please try again.*",
+    # During shutdown, DownloadError::Cancelled may be logged as an error.  Cleaning this
+    # up is tracked in https://github.com/neondatabase/neon/issues/6096
+    ".*Cancelled, shutting down.*",
 )
 
 

From d066dad84b076daf3781cdf9a692098889d3974e Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 15 Dec 2023 20:37:47 +0000
Subject: [PATCH 110/202] pageserver: prioritize activation of tenants with
 client requests (#6112)

## Problem

During startup, a client request might have to wait a long time while
the system is busy initializing all the attached tenants, even though
most of the attached tenants probably don't have any client requests to
service, and could wait a bit.

## Summary of changes

- Add a semaphore to limit how many Tenant::spawn()s may concurrently do
I/O to attach their tenant (i.e. read indices from remote storage, scan
local layer files, etc).
- Add Tenant::activate_now, a hook for kicking a tenant in its spawn()
method to skip waiting for the warmup semaphore.
- For tenants that attached via warmup semaphore units, wait for logical
size calculation to complete before dropping the warmup units.
- Set Tenant::activate_now in `get_active_tenant_with_timeout` (the page
service's path for getting a reference to a tenant).
- Wait for tenant activation in HTTP handlers for timeline creation and
deletion: like page service requests, these require an active tenant and
should prioritize activation if called.
---
 pageserver/src/config.rs                      |  36 +++++
 pageserver/src/http/routes.rs                 |  47 +++++-
 pageserver/src/metrics.rs                     |  51 ++++++-
 pageserver/src/tenant.rs                      | 142 ++++++++++++++++--
 pageserver/src/tenant/mgr.rs                  |  52 ++-----
 pageserver/src/tenant/timeline.rs             |  32 ++++
 .../src/tenant/timeline/logical_size.rs       |   5 +
 test_runner/regress/test_timeline_size.py     | 142 +++++++++++++++++-
 8 files changed, 450 insertions(+), 57 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index cd99cda783..bd63c4d860 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -41,6 +41,8 @@ use crate::{
     TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };
 
+use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
+
 pub mod defaults {
     use crate::tenant::config::defaults::*;
     use const_format::formatcp;
@@ -61,6 +63,8 @@ pub mod defaults {
 
     pub const DEFAULT_LOG_FORMAT: &str = "plain";
 
+    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
+
     pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
         super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
 
@@ -94,6 +98,7 @@ pub mod defaults {
 #log_format = '{DEFAULT_LOG_FORMAT}'
 
 #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
+#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
 
 #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
@@ -180,6 +185,11 @@ pub struct PageServerConf {
 
     pub log_format: LogFormat,
 
+    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
+    /// does not limit tenants loaded in response to client I/O.  A lower value implicitly deprioritizes
+    /// loading such tenants, vs. other work in the system.
+    pub concurrent_tenant_warmup: ConfigurableSemaphore,
+
     /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
     pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
     /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
@@ -283,6 +293,7 @@ struct PageServerConfigBuilder {
 
     log_format: BuilderValue<LogFormat>,
 
+    concurrent_tenant_warmup: BuilderValue<NonZeroUsize>,
     concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
 
     metric_collection_interval: BuilderValue<Duration>,
@@ -340,6 +351,8 @@ impl Default for PageServerConfigBuilder {
             .expect("cannot parse default keepalive interval")),
             log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
 
+            concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
+                .expect("Invalid default constant")),
             concurrent_tenant_size_logical_size_queries: Set(
                 ConfigurableSemaphore::DEFAULT_INITIAL,
             ),
@@ -453,6 +466,10 @@ impl PageServerConfigBuilder {
         self.log_format = BuilderValue::Set(log_format)
     }
 
+    pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) {
+        self.concurrent_tenant_warmup = BuilderValue::Set(u);
+    }
+
     pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
         self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
     }
@@ -518,6 +535,9 @@ impl PageServerConfigBuilder {
     }
 
     pub fn build(self) -> anyhow::Result<PageServerConf> {
+        let concurrent_tenant_warmup = self
+            .concurrent_tenant_warmup
+            .ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
         let concurrent_tenant_size_logical_size_queries = self
             .concurrent_tenant_size_logical_size_queries
             .ok_or(anyhow!(
@@ -570,6 +590,7 @@ impl PageServerConfigBuilder {
                 .broker_keepalive_interval
                 .ok_or(anyhow!("No broker keepalive interval provided"))?,
             log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
+            concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
             concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
                 concurrent_tenant_size_logical_size_queries,
             ),
@@ -807,6 +828,11 @@ impl PageServerConf {
                 "log_format" => builder.log_format(
                     LogFormat::from_config(&parse_toml_string(key, item)?)?
                 ),
+                "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({
+                    let input = parse_toml_string(key, item)?;
+                    let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
+                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
+                }),
                 "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
                     let input = parse_toml_string(key, item)?;
                     let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
@@ -904,6 +930,10 @@ impl PageServerConf {
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
             broker_keepalive_interval: Duration::from_secs(5000),
             log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
+            concurrent_tenant_warmup: ConfigurableSemaphore::new(
+                NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
+                    .expect("Invalid default constant"),
+            ),
             concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
             eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
             ),
@@ -1122,6 +1152,9 @@ background_task_maximum_delay = '334 s'
                     storage_broker::DEFAULT_KEEPALIVE_INTERVAL
                 )?,
                 log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
+                concurrent_tenant_warmup: ConfigurableSemaphore::new(
+                    NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
+                ),
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
                 eviction_task_immitated_concurrent_logical_size_queries:
                     ConfigurableSemaphore::default(),
@@ -1188,6 +1221,9 @@ background_task_maximum_delay = '334 s'
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
                 broker_keepalive_interval: Duration::from_secs(5),
                 log_format: LogFormat::Json,
+                concurrent_tenant_warmup: ConfigurableSemaphore::new(
+                    NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
+                ),
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
                 eviction_task_immitated_concurrent_logical_size_queries:
                     ConfigurableSemaphore::default(),
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 102c9d9a6a..47092c3bb0 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -38,6 +38,7 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
+use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::{
     GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
     TenantSlotError, TenantSlotUpsertError, TenantStateError,
@@ -67,6 +68,11 @@ use utils::{
 // Imports only used for testing APIs
 use super::models::ConfigureFailpointsRequest;
 
+// For APIs that require an Active tenant, how long should we block waiting for that state?
+// This is not functionally necessary (clients will retry), but avoids generating a lot of
+// failed API calls while tenants are activating.
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+
 pub struct State {
     conf: &'static PageServerConf,
     tenant_manager: Arc<TenantManager>,
@@ -233,6 +239,19 @@ impl From<GetTenantError> for ApiError {
     }
 }
 
+impl From<GetActiveTenantError> for ApiError {
+    fn from(e: GetActiveTenantError) -> ApiError {
+        match e {
+            GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
+            GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
+            GetActiveTenantError::NotFound(gte) => gte.into(),
+            GetActiveTenantError::WaitForActiveTimeout { .. } => {
+                ApiError::ResourceUnavailable(format!("{}", e).into())
+            }
+        }
+    }
+}
+
 impl From<SetNewTenantConfigError> for ApiError {
     fn from(e: SetNewTenantConfigError) -> ApiError {
         match e {
@@ -435,7 +454,10 @@ async fn timeline_create_handler(
     let state = get_state(&request);
 
     async {
-        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
+        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
         match tenant.create_timeline(
             new_timeline_id,
             request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -694,11 +716,23 @@ async fn timeline_delete_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
     let state = get_state(&request);
 
-    state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
-        .instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id, false)
+        .map_err(|e| {
+            match e {
+                // GetTenantError has a built-in conversion to ApiError, but in this context we don't
+                // want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
+                GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
+                    "Requested tenant is missing".to_string().into_boxed_str(),
+                ),
+                e => e.into(),
+            }
+        })?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
         .await?;
 
     json_response(StatusCode::ACCEPTED, ())
@@ -1136,7 +1170,10 @@ async fn tenant_create_handler(
 
     // We created the tenant. Existing API semantics are that the tenant
     // is Active when this function returns.
-    if let res @ Err(_) = new_tenant.wait_to_become_active().await {
+    if let res @ Err(_) = new_tenant
+        .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
+        .await
+    {
         // This shouldn't happen because we just created the tenant directory
         // in tenant::mgr::create_tenant, and there aren't any remote timelines
         // to load, so, nothing can really fail during load.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index ba6fd00bd1..45c01b71d1 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -684,14 +684,54 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
     .expect("Failed to register pageserver_startup_is_loading")
 });
 
-/// How long did tenants take to go from construction to active state?
-pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
+/// like how long it took to load.
+///
+/// Note that these are process-global metrics, _not_ per-tenant metrics.  Per-tenant
+/// metrics are rather expensive, and usually fine grained stuff makes more sense
+/// at a timeline level than tenant level.
+pub(crate) struct TenantMetrics {
+    /// How long did tenants take to go from construction to active state?
+    pub(crate) activation: Histogram,
+    pub(crate) preload: Histogram,
+    pub(crate) attach: Histogram,
+
+    /// How many tenants are included in the initial startup of the pageserver?
+    pub(crate) startup_scheduled: IntCounter,
+    pub(crate) startup_complete: IntCounter,
+}
+
+pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
+    TenantMetrics {
+    activation: register_histogram!(
         "pageserver_tenant_activation_seconds",
         "Time taken by tenants to activate, in seconds",
         CRITICAL_OP_BUCKETS.into()
     )
-    .expect("Failed to register pageserver_tenant_activation_seconds metric")
+    .expect("Failed to register metric"),
+    preload: register_histogram!(
+        "pageserver_tenant_preload_seconds",
+        "Time taken by tenants to load remote metadata on startup/attach, in seconds",
+        CRITICAL_OP_BUCKETS.into()
+    )
+    .expect("Failed to register metric"),
+    attach: register_histogram!(
+        "pageserver_tenant_attach_seconds",
+        "Time taken by tenants to intialize, after remote metadata is already loaded",
+        CRITICAL_OP_BUCKETS.into()
+    )
+    .expect("Failed to register metric"),
+    startup_scheduled: register_int_counter!(
+        "pageserver_tenant_startup_scheduled",
+        "Number of tenants included in pageserver startup (doesn't count tenants attached later)"
+    ).expect("Failed to register metric"),
+    startup_complete: register_int_counter!(
+        "pageserver_tenant_startup_complete",
+        "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
+         should eventually reach `pageserver_tenant_startup_scheduled_total`.  Does not include broken \
+         tenants: such cases will lead to this metric never reaching the scheduled count."
+    ).expect("Failed to register metric"),
+}
 });
 
 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
@@ -2213,6 +2253,9 @@ pub fn preinitialize_metrics() {
     // Deletion queue stats
     Lazy::force(&DELETION_QUEUE);
 
+    // Tenant stats
+    Lazy::force(&TENANT);
+
     // Tenant manager stats
     Lazy::force(&TENANT_MANAGER);
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0d1be33331..1478a1a445 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -36,6 +36,8 @@ use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext;
 use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
+use utils::timeout::timeout_cancellable;
+use utils::timeout::TimeoutCancellableError;
 
 use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
@@ -59,7 +61,7 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
-use crate::metrics::TENANT_ACTIVATION;
+use crate::metrics::TENANT;
 use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -226,7 +228,7 @@ pub struct Tenant {
 
     /// The value creation timestamp, used to measure activation delay, see:
     /// <https://github.com/neondatabase/neon/issues/4025>
-    loading_started_at: Instant,
+    constructed_at: Instant,
 
     state: watch::Sender<TenantState>,
 
@@ -276,6 +278,11 @@ pub struct Tenant {
 
     eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
 
+    /// If the tenant is in Activating state, notify this to encourage it
+    /// to proceed to Active as soon as possible, rather than waiting for lazy
+    /// background warmup.
+    pub(crate) activate_now_sem: tokio::sync::Semaphore,
+
     pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
 
     // Cancellation token fires when we have entered shutdown().  This is a parent of
@@ -622,6 +629,11 @@ impl Tenant {
             "attach tenant",
             false,
             async move {
+                scopeguard::defer! {
+                    tracing::info!("Increment complete count");
+                    TENANT.startup_complete.inc();
+                }
+
                 // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
                 let make_broken =
                     |t: &Tenant, err: anyhow::Error| {
@@ -648,6 +660,56 @@ impl Tenant {
                     .as_mut()
                     .and_then(|x| x.initial_tenant_load_remote.take());
 
+                enum AttachType<'a> {
+                    // During pageserver startup, we are attaching this tenant lazily in the background
+                    Warmup(tokio::sync::SemaphorePermit<'a>),
+                    // During pageserver startup, we are attaching this tenant as soon as we can,
+                    // because a client tried to access it.
+                    OnDemand,
+                    // During normal operations after startup, we are attaching a tenant.
+                    Normal,
+                }
+
+                // Before doing any I/O, wait for either of:
+                // - A client to attempt to access this tenant (on-demand loading)
+                // - A permit to become available in the warmup semaphore (background warmup)
+                //
+                // Some-ness of init_order is how we know if we're attaching during startup or later
+                // in process lifetime.
+                let attach_type = if init_order.is_some() {
+                    tokio::select!(
+                        _ = tenant_clone.activate_now_sem.acquire() => {
+                            tracing::info!("Activating tenant (on-demand)");
+                            AttachType::OnDemand
+                        },
+                        permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
+                            match permit_result {
+                                Ok(p) => {
+                                    tracing::info!("Activating tenant (warmup)");
+                                    AttachType::Warmup(p)
+                                }
+                                Err(_) => {
+                                    // This is unexpected: the warmup semaphore should stay alive
+                                    // for the lifetime of init_order.  Log a warning and proceed.
+                                    tracing::warn!("warmup_limit semaphore unexpectedly closed");
+                                    AttachType::Normal
+                                }
+                            }
+
+                        }
+                        _ = tenant_clone.cancel.cancelled() => {
+                            // This is safe, but should be pretty rare: it is interesting if a tenant
+                            // stayed in Activating for such a long time that shutdown found it in
+                            // that state.
+                            tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
+                            return Ok(());
+                        },
+                    )
+                } else {
+                    AttachType::Normal
+                };
+
+                let preload_timer = TENANT.preload.start_timer();
                 let preload = match mode {
                     SpawnMode::Create => {None},
                     SpawnMode::Normal => {
@@ -670,6 +732,7 @@ impl Tenant {
                         }
                     }
                 };
+                preload_timer.observe_duration();
 
                 // Remote preload is complete.
                 drop(remote_load_completion);
@@ -721,15 +784,39 @@ impl Tenant {
                     }
                 }
 
+                let attach_timer = TENANT.attach.start_timer();
                 match tenant_clone.attach(preload, &ctx).await {
                     Ok(()) => {
                         info!("attach finished, activating");
+                        attach_timer.observe_duration();
                         tenant_clone.activate(broker_client, None, &ctx);
                     }
                     Err(e) => {
+                        attach_timer.observe_duration();
                         make_broken(&tenant_clone, anyhow::anyhow!(e));
                     }
                 }
+
+                // If we are doing an opportunistic warmup attachment at startup, initialize
+                // logical size at the same time.  This is better than starting a bunch of idle tenants
+                // with cold caches and then coming back later to initialize their logical sizes.
+                //
+                // It also prevents the warmup process competing with the concurrency limit on
+                // logical size calculations: if logical size calculation semaphore is saturated,
+                // then warmup will wait for that before proceeding to the next tenant.
+                if let AttachType::Warmup(_permit) = attach_type {
+                    let mut futs = FuturesUnordered::new();
+                    let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
+                    for t in timelines {
+                        futs.push(t.await_initial_logical_size())
+                    }
+                    tracing::info!("Waiting for initial logical sizes while warming up...");
+                    while futs.next().await.is_some() {
+
+                    }
+                    tracing::info!("Warm-up complete");
+                }
+
                 Ok(())
             }
             .instrument({
@@ -1696,6 +1783,15 @@ impl Tenant {
         Ok(loaded_timeline)
     }
 
+    pub(crate) async fn delete_timeline(
+        self: Arc<Self>,
+        timeline_id: TimelineId,
+    ) -> Result<(), DeleteTimelineError> {
+        DeleteTimelineFlow::run(&self, timeline_id, false).await?;
+
+        Ok(())
+    }
+
     /// perform one garbage collection iteration, removing old data files from disk.
     /// this function is periodically called by gc task.
     /// also it can be explicitly requested through page server api 'do_gc' command.
@@ -1857,7 +1953,7 @@ impl Tenant {
                 );
                 *current_state = TenantState::Active;
 
-                let elapsed = self.loading_started_at.elapsed();
+                let elapsed = self.constructed_at.elapsed();
                 let total_timelines = timelines_accessor.len();
 
                 // log a lot of stuff, because some tenants sometimes suffer from user-visible
@@ -1872,7 +1968,7 @@ impl Tenant {
                     "activation attempt finished"
                 );
 
-                TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
+                TENANT.activation.observe(elapsed.as_secs_f64());
             });
         }
     }
@@ -2127,18 +2223,41 @@ impl Tenant {
         self.state.subscribe()
     }
 
-    pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
+    /// The activate_now semaphore is initialized with zero units.  As soon as
+    /// we add a unit, waiters will be able to acquire a unit and proceed.
+    pub(crate) fn activate_now(&self) {
+        self.activate_now_sem.add_permits(1);
+    }
+
+    pub(crate) async fn wait_to_become_active(
+        &self,
+        timeout: Duration,
+    ) -> Result<(), GetActiveTenantError> {
         let mut receiver = self.state.subscribe();
         loop {
             let current_state = receiver.borrow_and_update().clone();
             match current_state {
                 TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
                     // in these states, there's a chance that we can reach ::Active
-                    receiver.changed().await.map_err(
-                        |_e: tokio::sync::watch::error::RecvError|
-                            // Tenant existed but was dropped: report it as non-existent
-                            GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
-                    )?;
+                    self.activate_now();
+                    match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
+                        Ok(r) => {
+                            r.map_err(
+                            |_e: tokio::sync::watch::error::RecvError|
+                                // Tenant existed but was dropped: report it as non-existent
+                                GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
+                        )?
+                        }
+                        Err(TimeoutCancellableError::Cancelled) => {
+                            return Err(GetActiveTenantError::Cancelled);
+                        }
+                        Err(TimeoutCancellableError::Timeout) => {
+                            return Err(GetActiveTenantError::WaitForActiveTimeout {
+                                latest_state: Some(self.current_state()),
+                                wait_time: timeout,
+                            });
+                        }
+                    }
                 }
                 TenantState::Active { .. } => {
                     return Ok(());
@@ -2463,7 +2582,7 @@ impl Tenant {
             conf,
             // using now here is good enough approximation to catch tenants with really long
             // activation times.
-            loading_started_at: Instant::now(),
+            constructed_at: Instant::now(),
             tenant_conf: Arc::new(RwLock::new(attached_conf)),
             timelines: Mutex::new(HashMap::new()),
             timelines_creating: Mutex::new(HashSet::new()),
@@ -2475,6 +2594,7 @@ impl Tenant {
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
             eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            activate_now_sem: tokio::sync::Semaphore::new(0),
             delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
             cancel: CancellationToken::default(),
             gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index f53951e1d3..b2f14db9f7 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -28,7 +28,7 @@ use crate::control_plane_client::{
     ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
-use crate::metrics::TENANT_MANAGER as METRICS;
+use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
     AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
@@ -44,7 +44,6 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};
 
 use super::delete::DeleteTenantError;
-use super::timeline::delete::DeleteTimelineFlow;
 use super::TenantSharedResources;
 
 /// For a tenant that appears in TenantsMap, it may either be
@@ -430,6 +429,13 @@ pub async fn init_tenant_mgr(
     let tenant_generations =
         init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
 
+    tracing::info!(
+        "Attaching {} tenants at startup, warming up {} at a time",
+        tenant_configs.len(),
+        conf.concurrent_tenant_warmup.initial_permits()
+    );
+    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
+
     // Construct `Tenant` objects and start them running
     for (tenant_shard_id, location_conf) in tenant_configs {
         let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
@@ -848,17 +854,6 @@ impl TenantManager {
         }
     }
 
-    pub(crate) async fn delete_timeline(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        _ctx: &RequestContext,
-    ) -> Result<(), DeleteTimelineError> {
-        let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?;
-        DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
-        Ok(())
-    }
-
     #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
     pub(crate) async fn upsert_location(
         &self,
@@ -1221,7 +1216,10 @@ pub(crate) async fn get_active_tenant_with_timeout(
                         // Fast path: we don't need to do any async waiting.
                         return Ok(tenant.clone());
                     }
-                    _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
+                    _ => {
+                        tenant.activate_now();
+                        (WaitFor::Tenant(tenant.clone()), tenant_shard_id)
+                    }
                 }
             }
             Some(TenantSlot::Secondary) => {
@@ -1275,28 +1273,10 @@ pub(crate) async fn get_active_tenant_with_timeout(
     };
 
     tracing::debug!("Waiting for tenant to enter active state...");
-    match timeout_cancellable(
-        deadline.duration_since(Instant::now()),
-        cancel,
-        tenant.wait_to_become_active(),
-    )
-    .await
-    {
-        Ok(Ok(())) => Ok(tenant),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => {
-            let latest_state = tenant.current_state();
-            if latest_state == TenantState::Active {
-                Ok(tenant)
-            } else {
-                Err(GetActiveTenantError::WaitForActiveTimeout {
-                    latest_state: Some(latest_state),
-                    wait_time: timeout,
-                })
-            }
-        }
-        Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled),
-    }
+    tenant
+        .wait_to_become_active(deadline.duration_since(Instant::now()))
+        .await?;
+    Ok(tenant)
 }
 
 pub(crate) async fn delete_tenant(
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7438215a68..1e84fa1848 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1734,6 +1734,7 @@ impl Timeline {
                 self.current_logical_size.current_size().accuracy(),
                 logical_size::Accuracy::Exact,
             );
+            self.current_logical_size.initialized.add_permits(1);
             return;
         };
 
@@ -1779,6 +1780,11 @@ impl Timeline {
         cancel: CancellationToken,
         background_ctx: RequestContext,
     ) {
+        scopeguard::defer! {
+            // Irrespective of the outcome of this operation, we should unblock anyone waiting for it.
+            self.current_logical_size.initialized.add_permits(1);
+        }
+
         enum BackgroundCalculationError {
             Cancelled,
             Other(anyhow::Error),
@@ -3104,6 +3110,32 @@ impl Timeline {
 
         Ok(image_layers)
     }
+
+    /// Wait until the background initial logical size calculation is complete, or
+    /// this Timeline is shut down.  Calling this function will cause the initial
+    /// logical size calculation to skip waiting for the background jobs barrier.
+    pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
+        if let Some(await_bg_cancel) = self
+            .current_logical_size
+            .cancel_wait_for_background_loop_concurrency_limit_semaphore
+            .get()
+        {
+            await_bg_cancel.cancel();
+        } else {
+            // We should not wait if we were not able to explicitly instruct
+            // the logical size calculation to skip the concurrency limit semaphore.
+            // TODO: this is an unexpected case.  We should restructure so that it can't happen.
+            // NOTE(review): control still reaches the wait below in this case -- confirm intent.
+            tracing::info!(
+                "await_initial_logical_size: can't get semaphore cancel token, skipping"
+            );
+        }
+
+        tokio::select!(
+            _ = self.current_logical_size.initialized.acquire() => {},
+            _ = self.cancel.cancelled() => {}
+        )
+    }
 }
 
 #[derive(Default)]
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
index f2db8c91fc..03bc59ea38 100644
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -34,6 +34,9 @@ pub(super) struct LogicalSize {
     pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
         OnceCell<CancellationToken>,
 
+    /// Once the initial logical size is initialized, this is notified.
+    pub(crate) initialized: tokio::sync::Semaphore,
+
     /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
     pub initial_part_end: Option<Lsn>,
 
@@ -125,6 +128,7 @@ impl LogicalSize {
             initial_part_end: None,
             size_added_after_initial: AtomicI64::new(0),
             did_return_approximate_to_walreceiver: AtomicBool::new(false),
+            initialized: tokio::sync::Semaphore::new(0),
         }
     }
 
@@ -135,6 +139,7 @@ impl LogicalSize {
             initial_part_end: Some(compute_to),
             size_added_after_initial: AtomicI64::new(0),
             did_return_approximate_to_walreceiver: AtomicBool::new(false),
+            initialized: tokio::sync::Semaphore::new(0),
         }
     }
 
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 24cbe34457..6e510b2eba 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -300,7 +300,8 @@ def test_timeline_initial_logical_size_calculation_cancellation(
     env = neon_env_builder.init_start()
     client = env.pageserver.http_client()
 
-    tenant_id, timeline_id = env.neon_cli.create_tenant()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
 
     # load in some data
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -732,3 +733,142 @@ def wait_for_timeline_size_init(
     raise Exception(
         f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
     )
+
+
+def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
+    """
+    Tenants warming up opportunistically will wait for one another's logical size calculations to complete
+    before proceeding.  However, they skip this if a client is actively trying to access them.
+
+    This test is not purely about logical sizes, but logical size calculation is the phase that we
+    use as a proxy for "warming up" in this test: it happens within the semaphore guard used
+    to limit concurrent tenant warm-up.
+    """
+
+    # We will run with the limit set to 1, so that once we have one tenant stuck
+    # in a pausable failpoint, the rest are prevented from proceeding through warmup.
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # Create some tenants
+    n_tenants = 10
+    tenant_ids = {env.initial_tenant}
+    for _i in range(0, n_tenants - 1):
+        tenant_id = TenantId.generate()
+        env.pageserver.tenant_create(tenant_id)
+
+        # Empty tenants are not subject to waiting for logical size calculations, because
+        # those happen at the timeline level
+        timeline_id = TimelineId.generate()
+        env.neon_cli.create_timeline(
+            new_branch_name="main", tenant_id=tenant_id, timeline_id=timeline_id
+        )
+
+        tenant_ids.add(tenant_id)
+
+    # Restart pageserver with logical size calculations paused
+    env.pageserver.stop()
+    env.pageserver.start(
+        extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
+    )
+
+    def get_tenant_states():
+        states = {}
+        for tenant_id in tenant_ids:
+            tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
+            states[tenant_id] = tenant["state"]["slug"]
+        log.info(f"Tenant states: {states}")
+        return states
+
+    def at_least_one_active():
+        assert "Active" in set(get_tenant_states().values())
+
+    # One tenant should activate, then get stuck in their logical size calculation
+    wait_until(10, 1, at_least_one_active)
+
+    # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate
+    time.sleep(5)
+
+    # We should see one tenant win the activation race, and enter logical size calculation.  The rest
+    # will stay in Attaching state, waiting for the "warmup_limit" semaphore
+    expect_activated = 1
+    states = get_tenant_states()
+    assert len([s for s in states.values() if s == "Active"]) == expect_activated
+    assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated
+
+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
+    )
+
+    # This is zero, and subsequent checks are expect_activated - 1, because this counter does not
+    # count how many tenants are Active, it counts how many have finished warmup.  The first tenant
+    # that reached Active is still stuck in its logical size calculation, and has therefore not finished warmup.
+    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0
+
+    # If a client accesses one of the blocked tenants, it should skip waiting for warmup and
+    # go active as fast as it can.
+    stuck_tenant_id = list(
+        [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
+    )[0][0]
+
+    endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id)
+    endpoint.safe_psql_many(
+        [
+            "CREATE TABLE foo (x INTEGER)",
+            "INSERT INTO foo SELECT g FROM generate_series(1, 10) g",
+        ]
+    )
+    endpoint.stop()
+
+    # That one that we successfully accessed is now Active
+    expect_activated += 1
+    assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
+        == expect_activated - 1
+    )
+
+    # The ones we didn't touch are still in Attaching
+    assert (
+        len([s for s in get_tenant_states().values() if s == "Attaching"])
+        == n_tenants - expect_activated
+    )
+
+    # Timeline creation operations also wake up Attaching tenants
+    stuck_tenant_id = list(
+        [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
+    )[0][0]
+    pageserver_http.timeline_create(env.pg_version, stuck_tenant_id, TimelineId.generate())
+    expect_activated += 1
+    assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
+    assert (
+        len([s for s in get_tenant_states().values() if s == "Attaching"])
+        == n_tenants - expect_activated
+    )
+
+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
+        == expect_activated - 1
+    )
+
+    # When we unblock logical size calculation, all tenants should proceed to active state via
+    # the warmup route.
+    pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
+
+    def all_active():
+        assert all(s == "Active" for s in get_tenant_states().values())
+
+    wait_until(10, 1, all_active)
+
+    # Final control check: restarting with no failpoints at all results in all tenants coming active
+    # without being prompted by client I/O
+    env.pageserver.stop()
+    env.pageserver.start()
+    wait_until(10, 1, all_active)
+
+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
+    )
+    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants

From 83811491da80794a6cd89827db080c79a83def2f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Sat, 16 Dec 2023 09:06:00 +0000
Subject: [PATCH 111/202] update zerocopy (#6148)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

https://github.com/neondatabase/neon/security/dependabot/48

```
$ cargo tree -i zerocopy
zerocopy v0.7.3
└── ahash v0.8.5
    └── hashbrown v0.13.2
```

ahash doesn't use the affected APIs, so we are not vulnerable, but it is best to
update to silence the alert anyway

## Summary of changes

```
$ cargo update -p zerocopy --precise 0.7.31
    Updating crates.io index
    Updating syn v2.0.28 -> v2.0.32
    Updating zerocopy v0.7.3 -> v0.7.31
    Updating zerocopy-derive v0.7.3 -> v0.7.31
```
---
 Cargo.lock | 56 +++++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 99a936dd40..96f6b494c4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -233,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -244,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -881,7 +881,7 @@ dependencies = [
  "regex",
  "rustc-hash",
  "shlex",
- "syn 2.0.28",
+ "syn 2.0.32",
  "which",
 ]
 
@@ -1095,7 +1095,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -1486,7 +1486,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -1497,7 +1497,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
 dependencies = [
  "darling_core",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -1572,7 +1572,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -1666,7 +1666,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -1920,7 +1920,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -2906,7 +2906,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -3349,7 +3349,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -3556,7 +3556,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
  "proc-macro2",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -4164,7 +4164,7 @@ dependencies = [
  "regex",
  "relative-path",
  "rustc_version",
- "syn 2.0.28",
+ "syn 2.0.32",
  "unicode-ident",
 ]
 
@@ -4599,7 +4599,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -4680,7 +4680,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -4947,9 +4947,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.28"
+version = "2.0.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
+checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -5079,7 +5079,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -5197,7 +5197,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -5498,7 +5498,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]
@@ -5943,7 +5943,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
  "wasm-bindgen-shared",
 ]
 
@@ -5977,7 +5977,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -6314,7 +6314,7 @@ dependencies = [
  "smallvec",
  "subtle",
  "syn 1.0.109",
- "syn 2.0.28",
+ "syn 2.0.32",
  "time",
  "time-macros",
  "tokio",
@@ -6376,22 +6376,22 @@ dependencies = [
 
 [[package]]
 name = "zerocopy"
-version = "0.7.3"
+version = "0.7.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74"
+checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
 dependencies = [
  "zerocopy-derive",
 ]
 
 [[package]]
 name = "zerocopy-derive"
-version = "0.7.3"
+version = "0.7.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd"
+checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]
 
 [[package]]

From 8619e6295a8a32f20282cf8a264f1d1e019a2e05 Mon Sep 17 00:00:00 2001
From: Abhijeet Patil <abhi.gets.mail@gmail.com>
Date: Sat, 16 Dec 2023 10:33:52 +0000
Subject: [PATCH 112/202] CI: build build-tools image (#6082)

## Currently our build Dockerfile is located in the build repo; it makes
sense to have it as a part of our neon repo

## Summary of changes
The Dockerfile that we use to build our binaries and other tools used to
reside in the build repo.
It makes sense to bring the Dockerfile into the repo where it is
used,
so that contributors can also view it and amend it if required.
This reduces maintenance: Dockerfile changes and code changes can
be accommodated in the same PR.
Also, building the image and pushing it to ECR is abstracted into a
reusable workflow. Ideally, other jobs will use that workflow too.

## Checklist before requesting a review

- [x] Moved the docker file used to build the binary from the build repo
to the neon repo
- [x] adding gh workflow to build and push the image
- [x] adding gh workflow to tag the pushed image
- [x] update readMe file

---------

Co-authored-by: Abhijeet Patil <abhijeet@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .../workflows/build_and_push_docker_image.yml |  57 ++++++
 .github/workflows/build_and_test.yml          |  82 +++++++--
 .../workflows/update_build_tools_image.yml    | 130 ++++++++++++++
 .gitignore                                    |   1 +
 CONTRIBUTING.md                               |  14 ++
 Dockerfile                                    |   2 +-
 Dockerfile.buildtools                         | 165 ++++++++++++++++++
 Dockerfile.compute-node                       |   2 +-
 Dockerfile.compute-tools                      |   2 +-
 9 files changed, 434 insertions(+), 21 deletions(-)
 create mode 100644 .github/workflows/build_and_push_docker_image.yml
 create mode 100644 .github/workflows/update_build_tools_image.yml
 create mode 100644 Dockerfile.buildtools

diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
new file mode 100644
index 0000000000..8780c0fef7
--- /dev/null
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -0,0 +1,57 @@
+name: Build and Push Docker Image
+
+on:
+  workflow_call:
+    inputs:
+      dockerfile-path:
+        required: true
+        type: string
+      image-name:
+        required: true
+        type: string
+      tag:
+        required: true
+        type: string
+
+jobs:
+  kaniko:
+    runs-on: [ self-hosted, dev, x64 ]
+    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1
+
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+
+      - name: Kaniko build
+        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-amd64
+
+  kaniko-arm:
+    runs-on: [ self-hosted, dev, arm64 ]
+    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1
+
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+
+      - name: Kaniko build
+        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-arm64
+
+  manifest:
+    name: 'manifest'
+    runs-on: [ self-hosted, dev, x64 ]
+    needs:
+      - kaniko
+      - kaniko-arm
+
+    steps:
+      - name: Create manifest
+        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-arm64
+
+      - name: Push manifest
+        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 693ed1a66f..808d39b33c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -44,13 +44,13 @@ jobs:
 
         exit 1
 
-
   tag:
     needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
     outputs:
       build-tag: ${{steps.build-tag.outputs.tag}}
+      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
 
     steps:
       - name: Checkout
@@ -74,11 +74,50 @@ jobs:
         shell: bash
         id: build-tag
 
-  check-codestyle-python:
+      - name: Get buildtools tag
+        run: |
+          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then
+            IMAGE_TAG=$GITHUB_RUN_ID
+          else
+            IMAGE_TAG=pinned
+          fi
+          
+          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
+        shell: bash
+        id: buildtools-tag
+
+  check-if-build-tools-dockerfile-changed:
     needs: [ check-permissions ]
+    runs-on: ubuntu-latest
+    outputs:
+      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
+    steps:
+      - name: Check if Dockerfile.buildtools has changed
+        id: dockerfile
+        run: |
+          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
+          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
+            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
+          fi
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  build-buildtools-image:
+    needs: [ check-permissions, check-if-build-tools-dockerfile-changed, tag]
+    uses: ./.github/workflows/build_and_push_docker_image.yml
+    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
+    with:
+      dockerfile-path: Dockerfile.buildtools
+      image-name: build-tools
+      tag: ${{ needs.tag.outputs.build-tools-tag}}
+    secrets: inherit
+
+  check-codestyle-python:
+    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
+    needs: [ check-permissions, build-buildtools-image , tag ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
       options: --init
 
     steps:
@@ -108,10 +147,11 @@ jobs:
         run: poetry run mypy .
 
   check-codestyle-rust:
-    needs: [ check-permissions ]
+    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
+    needs: [ check-permissions, build-buildtools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
       options: --init
 
     steps:
@@ -175,10 +215,11 @@ jobs:
         run: cargo deny check --hide-inclusion-graph
 
   build-neon:
-    needs: [ check-permissions, tag ]
+    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
+    needs: [ check-permissions, tag, build-buildtools-image ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
       options: --init
     strategy:
       fail-fast: false
@@ -411,7 +452,7 @@ jobs:
     needs: [ check-permissions, build-neon, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
       # Default shared memory is 64mb
       options: --init --shm-size=512mb
     strategy:
@@ -447,10 +488,10 @@ jobs:
         uses: ./.github/actions/save-coverage-data
 
   benchmarks:
-    needs: [ check-permissions, build-neon ]
+    needs: [ check-permissions, build-neon, tag]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
       # Default shared memory is 64mb
       options: --init --shm-size=512mb
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -479,12 +520,12 @@ jobs:
       # while coverage is currently collected for the debug ones
 
   create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, tag ]
     if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
 
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
       options: --init
 
     steps:
@@ -526,11 +567,10 @@ jobs:
             })
 
   coverage-report:
-    needs: [ check-permissions, regress-tests ]
-
+    needs: [ check-permissions, regress-tests, tag ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
       options: --init
     strategy:
       fail-fast: false
@@ -694,7 +734,8 @@ jobs:
             }"
 
   neon-image:
-    needs: [ check-permissions, tag ]
+    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
+    needs: [ check-permissions, build-buildtools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
@@ -733,6 +774,7 @@ jobs:
                            --context .
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+                           --build-arg TAG=${{ needs.tag.outputs.build-tools-tag }}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                            --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -742,8 +784,9 @@ jobs:
         run: rm -rf ~/.ecr
 
   compute-tools-image:
+    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
     runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, tag ]
+    needs: [ check-permissions, build-buildtools-image, tag ]
     container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
       run:
@@ -778,6 +821,7 @@ jobs:
                            --context .
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
+                           --build-arg TAG=${{needs.tag.outputs.build-tools-tag}}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --dockerfile Dockerfile.compute-tools
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -788,7 +832,8 @@ jobs:
         run: rm -rf ~/.ecr
 
   compute-node-image:
-    needs: [ check-permissions, tag ]
+    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
+    needs: [ check-permissions, build-buildtools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: gcr.io/kaniko-project/executor:v1.9.2-debug
@@ -836,6 +881,7 @@ jobs:
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg PG_VERSION=${{ matrix.version }}
                            --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
+                           --build-arg TAG=${{needs.tag.outputs.build-tools-tag}}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --dockerfile Dockerfile.compute-node
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml
new file mode 100644
index 0000000000..88bab797b7
--- /dev/null
+++ b/.github/workflows/update_build_tools_image.yml
@@ -0,0 +1,130 @@
+name: 'Update build tools image tag'
+
+# This workflow is used to update the tag of the build-tools image in ECR.
+# The most common use case is adding/moving the `pinned` tag to the `${GITHUB_RUN_ID}` image.
+
+on:
+  workflow_dispatch:
+    inputs:
+      from-tag:
+        description: 'Source tag'
+        required: true
+        type: string
+      to-tag:
+        description: 'Destination tag'
+        required: true
+        type: string
+        default: 'pinned'
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+env:
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+permissions: {}
+
+jobs:
+  tag-image:
+    runs-on: [ self-hosted, gen3, small ]
+    container: golang:1.19-bullseye
+
+    env:
+      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: ${{ inputs.to-tag }}
+    outputs:
+      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
+      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
+
+    steps:
+      - name: Install Crane & ECR helper
+        run: |
+          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+
+      - name: Configure ECR login
+        run: |
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Get source image digest
+        id: next-digest
+        run: |
+          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
+          if [ -z "${NEXT_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
+            exit 1
+          fi
+
+          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
+          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
+
+      - name: Get destination image digest (if already exists)
+        id: prev-digest
+        run: |
+          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
+          if [ -z "${PREV_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
+          else
+            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
+
+            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Tag image
+        run: |
+          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
+
+  rollback-tag-image:
+    needs:  tag-image
+    if: ${{ !success() }}
+
+    runs-on: [ self-hosted, gen3, small ]
+    container: golang:1.19-bullseye
+
+    env:
+      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: ${{ inputs.to-tag }}
+
+    steps:
+      - name: Install Crane & ECR helper
+        run: |
+          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+
+      - name: Configure ECR login
+        run: |
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Restore previous tag if needed
+        run: |
+          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
+          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
+
+          if [ -z "${NEXT_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
+            exit 0
+          fi
+
+          if [ -z "${PREV_DIGEST}" ]; then
+            # I guess we should delete the tag here/untag the image, but crane does not support it
+            # - https://github.com/google/go-containerregistry/issues/999
+
+            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
+
+            exit 0
+          fi
+
+          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
+          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
+            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
+
+            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
+          else
+            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
+          fi
diff --git a/.gitignore b/.gitignore
index c5fc121ac2..3f4495c9e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ __pycache__/
 test_output/
 .vscode
 .idea
+neon.iml
 /.neon
 /integration_tests/.neon
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2692684006..b318c295a3 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -70,3 +70,17 @@ We're using the following approach to make it work:
 - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
 
 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
+
+## How do I add the "pinned" tag to a build-tools image?
+We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup. Currently, adding the `pinned` tag is a manual operation.
+
+You can run the `update_build_tools_image.yml` workflow from the GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml
+or using the GitHub CLI:
+
+```bash
+gh workflow -R neondatabase/neon run update_build_tools_image.yml \
+            -f from-tag=6254913013 \
+            -f to-tag=pinned
+
+# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
+```
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 60de9cfa3e..5d5fde4f14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,7 @@
 ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
 ### inside this image in the real deployments.
 ARG REPOSITORY=neondatabase
-ARG IMAGE=rust
+ARG IMAGE=build-tools
 ARG TAG=pinned
 
 # Build Postgres
diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools
new file mode 100644
index 0000000000..d3d05b4e20
--- /dev/null
+++ b/Dockerfile.buildtools
@@ -0,0 +1,165 @@
+FROM debian:bullseye-slim
+
+# Add nonroot user
+RUN useradd -ms /bin/bash nonroot -b /home
+SHELL ["/bin/bash", "-c"]
+
+# System deps
+RUN set -e \
+    && apt update \
+    && apt install -y \
+        autoconf \
+        automake \
+        bison \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        flex \
+        git \
+        gnupg \
+        gzip \
+        jq \
+        libcurl4-openssl-dev \
+        libbz2-dev \
+        libffi-dev \
+        liblzma-dev \
+        libncurses5-dev \
+        libncursesw5-dev \
+        libpq-dev \
+        libreadline-dev \
+        libseccomp-dev \
+        libsqlite3-dev \
+        libssl-dev \
+        libstdc++-10-dev \
+        libtool \
+        libxml2-dev \
+        libxmlsec1-dev \
+        libxxhash-dev \
+        lsof \
+        make \
+        netcat \
+        net-tools \
+        openssh-client \
+        parallel \
+        pkg-config \
+        unzip \
+        wget \
+        xz-utils \
+        zlib1g-dev \
+        zstd \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# protobuf-compiler (protoc)
+ENV PROTOC_VERSION 22.2
+RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
+    && unzip -q protoc.zip -d protoc \
+    && mv protoc/bin/protoc /usr/local/bin/protoc \
+    && mv protoc/include/google /usr/local/include/google \
+    && rm -rf protoc.zip protoc
+
+# LLVM
+ENV LLVM_VERSION=17
+RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
+    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
+    && apt update \
+    && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
+    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# PostgreSQL 14
+RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
+    && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
+    && apt update \
+    && apt install -y postgresql-client-14 \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# AWS CLI
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
+    && unzip -q awscliv2.zip \
+    && ./aws/install \
+    && rm awscliv2.zip
+
+# Mold: A Modern Linker
+ENV MOLD_VERSION v2.1.0
+RUN set -e \
+    && git clone https://github.com/rui314/mold.git \
+    && mkdir mold/build \
+    && cd mold/build \
+    && git checkout ${MOLD_VERSION} \
+    && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
+    && cmake --build . -j $(nproc) \
+    && cmake --install . \
+    && cd .. \
+    && rm -rf mold
+
+# LCOV
+# Build lcov from a fork:
+# It includes several bug fixes on top of the v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
+# And patches from us:
+# - Generate a JSON file with the code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0)
+RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
+    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
+    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992  lcov.tar.gz" | sha256sum --check \
+    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
+    && cd lcov \
+    && make install \
+    && rm -rf ../lcov.tar.gz
+
+# Switch to nonroot user
+USER nonroot:nonroot
+WORKDIR /home/nonroot
+
+# Python
+ENV PYTHON_VERSION=3.9.2 \
+    PYENV_ROOT=/home/nonroot/.pyenv \
+    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
+RUN set -e \
+    && cd $HOME \
+    && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
+    && chmod +x pyenv-installer \
+    && ./pyenv-installer \
+    && export PYENV_ROOT=/home/nonroot/.pyenv \
+    && export PATH="$PYENV_ROOT/bin:$PATH" \
+    && export PATH="$PYENV_ROOT/shims:$PATH" \
+    && pyenv install ${PYTHON_VERSION} \
+    && pyenv global ${PYTHON_VERSION} \
+    && python --version \
+    && pip install --upgrade pip \
+    && pip --version \
+    && pip install pipenv wheel poetry
+
+# Switch to nonroot user (again)
+USER nonroot:nonroot
+WORKDIR /home/nonroot
+
+# Rust
+# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
+ENV RUSTC_VERSION=1.74.0
+ENV RUSTUP_HOME="/home/nonroot/.rustup"
+ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
+	chmod +x rustup-init && \
+	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
+	rm rustup-init && \
+    export PATH="$HOME/.cargo/bin:$PATH" && \
+    . "$HOME/.cargo/env" && \
+    cargo --version && rustup --version && \
+    rustup component add llvm-tools-preview rustfmt clippy && \
+    cargo install --git https://github.com/paritytech/cachepot && \
+    cargo install rustfilt && \
+    cargo install cargo-hakari && \
+    cargo install cargo-deny && \
+    cargo install cargo-hack && \
+    rm -rf /home/nonroot/.cargo/registry && \
+    rm -rf /home/nonroot/.cargo/git
+ENV RUSTC_WRAPPER=cachepot
+
+# Show versions
+RUN whoami \
+    && python --version \
+    && pip --version \
+    && cargo --version --verbose \
+    && rustup --version --verbose \
+    && rustc --version --verbose \
+    && clang --version
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 03280586f8..3809eb09a9 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -1,6 +1,6 @@
 ARG PG_VERSION
 ARG REPOSITORY=neondatabase
-ARG IMAGE=rust
+ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG BUILD_TAG
 
diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools
index 3066e3f7ca..cc305cc556 100644
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,7 +1,7 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
 ARG REPOSITORY=neondatabase
-ARG IMAGE=rust
+ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG BUILD_TAG
 

From 47873470db040b55fbf447a6c392f9a2b1091a02 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Sat, 16 Dec 2023 11:52:48 +0100
Subject: [PATCH 113/202] pageserver: add method to dump keyspace in mgmt api
 client (#6145)

Part of getpage@lsn benchmark epic:
https://github.com/neondatabase/neon/issues/5771
---
 Cargo.lock                                    |   1 +
 libs/pageserver_api/Cargo.toml                |   1 +
 libs/pageserver_api/src/key.rs                |  34 ++++
 .../pageserver_api}/src/keyspace.rs           |  32 +++-
 libs/pageserver_api/src/lib.rs                |   1 +
 libs/pageserver_api/src/models.rs             |   2 +
 .../pageserver_api/src/models/partitioning.rs | 151 ++++++++++++++++++
 pageserver/client/src/mgmt_api.rs             |  16 ++
 pageserver/src/http/mod.rs                    |   2 -
 pageserver/src/http/routes.rs                 |  77 +--------
 pageserver/src/lib.rs                         |   2 +-
 pageserver/src/repository.rs                  |  29 +---
 12 files changed, 246 insertions(+), 102 deletions(-)
 rename {pageserver => libs/pageserver_api}/src/keyspace.rs (93%)
 create mode 100644 libs/pageserver_api/src/models/partitioning.rs

diff --git a/Cargo.lock b/Cargo.lock
index 96f6b494c4..f931fd6c29 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3167,6 +3167,7 @@ dependencies = [
  "enum-map",
  "hex",
  "postgres_ffi",
+ "rand 0.8.5",
  "serde",
  "serde_json",
  "serde_with",
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 4d08d78e87..4146597d8d 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -24,3 +24,4 @@ workspace_hack.workspace = true
 
 [dev-dependencies]
 bincode.workspace = true
+rand.workspace = true
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 16715bc667..d680a5600e 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -144,3 +144,37 @@ impl Key {
 pub fn is_rel_block_key(key: &Key) -> bool {
     key.field1 == 0x00 && key.field4 != 0
 }
+
+impl std::str::FromStr for Key {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        Self::from_hex(s)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::str::FromStr;
+
+    use crate::key::Key;
+
+    use rand::Rng;
+    use rand::SeedableRng;
+
+    #[test]
+    fn display_fromstr_bijection() {
+        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
+
+        let key = Key {
+            field1: rng.gen(),
+            field2: rng.gen(),
+            field3: rng.gen(),
+            field4: rng.gen(),
+            field5: rng.gen(),
+            field6: rng.gen(),
+        };
+
+        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
+    }
+}
diff --git a/pageserver/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
similarity index 93%
rename from pageserver/src/keyspace.rs
rename to libs/pageserver_api/src/keyspace.rs
index 20e6df9c7b..16651c322e 100644
--- a/pageserver/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -1,11 +1,12 @@
-use crate::repository::{key_range_size, singleton_range, Key};
 use postgres_ffi::BLCKSZ;
 use std::ops::Range;
 
+use crate::key::Key;
+
 ///
 /// Represents a set of Keys, in a compact form.
 ///
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct KeySpace {
     /// Contiguous ranges of keys that belong to the key space. In key order,
     /// and with no overlap.
@@ -186,6 +187,33 @@ impl KeySpaceRandomAccum {
     }
 }
 
+pub fn key_range_size(key_range: &Range<Key>) -> u32 {
+    let start = key_range.start;
+    let end = key_range.end;
+
+    if end.field1 != start.field1
+        || end.field2 != start.field2
+        || end.field3 != start.field3
+        || end.field4 != start.field4
+    {
+        return u32::MAX;
+    }
+
+    let start = (start.field5 as u64) << 32 | start.field6 as u64;
+    let end = (end.field5 as u64) << 32 | end.field6 as u64;
+
+    let diff = end - start;
+    if diff > u32::MAX as u64 {
+        u32::MAX
+    } else {
+        diff as u32
+    }
+}
+
+pub fn singleton_range(key: Key) -> Range<Key> {
+    key..key.next()
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs
index 511c5ed208..b236b93428 100644
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -5,6 +5,7 @@ use const_format::formatcp;
 /// Public API types
 pub mod control_api;
 pub mod key;
+pub mod keyspace;
 pub mod models;
 pub mod reltag;
 pub mod shard;
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index fbc7d73235..a78ba8ad94 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,3 +1,5 @@
+pub mod partitioning;
+
 use std::{
     collections::HashMap,
     num::{NonZeroU64, NonZeroUsize},
diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs
new file mode 100644
index 0000000000..0d287f7be0
--- /dev/null
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -0,0 +1,151 @@
+use utils::lsn::Lsn;
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Partitioning {
+    pub keys: crate::keyspace::KeySpace,
+
+    pub at_lsn: Lsn,
+}
+
+impl serde::Serialize for Partitioning {
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
+
+        impl<'a> serde::Serialize for KeySpace<'a> {
+            fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+            where
+                S: serde::Serializer,
+            {
+                use serde::ser::SerializeSeq;
+                let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
+                for kr in &self.0.ranges {
+                    seq.serialize_element(&KeyRange(kr))?;
+                }
+                seq.end()
+            }
+        }
+
+        use serde::ser::SerializeMap;
+        let mut map = serializer.serialize_map(Some(2))?;
+        map.serialize_key("keys")?;
+        map.serialize_value(&KeySpace(&self.keys))?;
+        map.serialize_key("at_lsn")?;
+        map.serialize_value(&WithDisplay(&self.at_lsn))?;
+        map.end()
+    }
+}
+
+pub struct WithDisplay<'a, T>(&'a T);
+
+impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.collect_str(&self.0)
+    }
+}
+
+pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
+
+impl<'a> serde::Serialize for KeyRange<'a> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeTuple;
+        let mut t = serializer.serialize_tuple(2)?;
+        t.serialize_element(&WithDisplay(&self.0.start))?;
+        t.serialize_element(&WithDisplay(&self.0.end))?;
+        t.end()
+    }
+}
+
+impl<'a> serde::Deserialize<'a> for Partitioning {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'a>,
+    {
+        pub struct KeySpace(crate::keyspace::KeySpace);
+
+        impl<'de> serde::Deserialize<'de> for KeySpace {
+            fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                #[serde_with::serde_as]
+                #[derive(serde::Deserialize)]
+                #[serde(transparent)]
+                struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key);
+
+                #[serde_with::serde_as]
+                #[derive(serde::Deserialize)]
+                struct Range(Key, Key);
+
+                let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
+                Ok(Self(crate::keyspace::KeySpace {
+                    ranges: ranges
+                        .into_iter()
+                        .map(|Range(start, end)| (start.0..end.0))
+                        .collect(),
+                }))
+            }
+        }
+
+        #[serde_with::serde_as]
+        #[derive(serde::Deserialize)]
+        struct De {
+            keys: KeySpace,
+            #[serde_as(as = "serde_with::DisplayFromStr")]
+            at_lsn: Lsn,
+        }
+
+        let de: De = serde::Deserialize::deserialize(deserializer)?;
+        Ok(Self {
+            at_lsn: de.at_lsn,
+            keys: de.keys.0,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_serialization_roundtrip() {
+        let reference = r#"
+        {
+            "keys": [
+              [
+                "000000000000000000000000000000000000",
+                "000000000000000000000000000000000001"
+              ],
+              [
+                "000000067F00000001000000000000000000",
+                "000000067F00000001000000000000000002"
+              ],
+              [
+                "030000000000000000000000000000000000",
+                "030000000000000000000000000000000003"
+              ]
+            ],
+            "at_lsn": "0/2240160"
+        }
+        "#;
+
+        let de: Partitioning = serde_json::from_str(reference).unwrap();
+
+        let ser = serde_json::to_string(&de).unwrap();
+
+        let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap();
+
+        assert_eq!(
+            ser_de,
+            serde_json::from_str::<'_, serde_json::Value>(reference).unwrap()
+        );
+    }
+}
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 821c238082..77eb1bb8e2 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -92,6 +92,22 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn keyspace(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<pageserver_api::models::partitioning::Partitioning> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
+            self.mgmt_api_endpoint
+        );
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
         self.request(Method::GET, uri, ()).await
     }
diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs
index 1c083bd382..c82d1c0362 100644
--- a/pageserver/src/http/mod.rs
+++ b/pageserver/src/http/mod.rs
@@ -1,4 +1,2 @@
 pub mod routes;
 pub use routes::make_router;
-
-pub use pageserver_api::models;
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 47092c3bb0..601fad5bde 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -28,10 +28,6 @@ use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
 
-use super::models::{
-    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
-    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
-};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
@@ -51,6 +47,10 @@ use crate::tenant::timeline::Timeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
+use pageserver_api::models::{
+    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
+    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
+};
 use utils::{
     auth::SwappableJwtAuth,
     generation::Generation,
@@ -66,7 +66,7 @@ use utils::{
 };
 
 // Imports only used for testing APIs
-use super::models::ConfigureFailpointsRequest;
+use pageserver_api::models::ConfigureFailpointsRequest;
 
 // For APIs that require an Active tenant, how long should we block waiting for that state?
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
@@ -1524,69 +1524,6 @@ async fn timeline_collect_keyspace(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    struct Partitioning {
-        keys: crate::keyspace::KeySpace,
-
-        at_lsn: Lsn,
-    }
-
-    impl serde::Serialize for Partitioning {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeMap;
-            let mut map = serializer.serialize_map(Some(2))?;
-            map.serialize_key("keys")?;
-            map.serialize_value(&KeySpace(&self.keys))?;
-            map.serialize_key("at_lsn")?;
-            map.serialize_value(&WithDisplay(&self.at_lsn))?;
-            map.end()
-        }
-    }
-
-    struct WithDisplay<'a, T>(&'a T);
-
-    impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            serializer.collect_str(&self.0)
-        }
-    }
-
-    struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
-
-    impl<'a> serde::Serialize for KeySpace<'a> {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeSeq;
-            let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
-            for kr in &self.0.ranges {
-                seq.serialize_element(&KeyRange(kr))?;
-            }
-            seq.end()
-        }
-    }
-
-    struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
-
-    impl<'a> serde::Serialize for KeyRange<'a> {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeTuple;
-            let mut t = serializer.serialize_tuple(2)?;
-            t.serialize_element(&WithDisplay(&self.0.start))?;
-            t.serialize_element(&WithDisplay(&self.0.end))?;
-            t.end()
-        }
-    }
-
     let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
 
     async {
@@ -1598,7 +1535,9 @@ async fn timeline_collect_keyspace(
             .await
             .map_err(|e| ApiError::InternalServerError(e.into()))?;
 
-        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
+        let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
+
+        json_response(StatusCode::OK, res)
     }
     .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
     .await
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 797cb6f944..58adf6e8c4 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -10,7 +10,7 @@ pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
-pub mod keyspace;
+pub use pageserver_api::keyspace;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index 97d731bf49..c726139524 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -2,38 +2,11 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
-use std::ops::{AddAssign, Range};
+use std::ops::AddAssign;
 use std::time::Duration;
 
 pub use pageserver_api::key::{Key, KEY_SIZE};
 
-pub fn key_range_size(key_range: &Range<Key>) -> u32 {
-    let start = key_range.start;
-    let end = key_range.end;
-
-    if end.field1 != start.field1
-        || end.field2 != start.field2
-        || end.field3 != start.field3
-        || end.field4 != start.field4
-    {
-        return u32::MAX;
-    }
-
-    let start = (start.field5 as u64) << 32 | start.field6 as u64;
-    let end = (end.field5 as u64) << 32 | end.field6 as u64;
-
-    let diff = end - start;
-    if diff > u32::MAX as u64 {
-        u32::MAX
-    } else {
-        diff as u32
-    }
-}
-
-pub fn singleton_range(key: Key) -> Range<Key> {
-    key..key.next()
-}
-
 /// A 'value' stored for a one Key.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[cfg_attr(test, derive(PartialEq))]

From 9bdc25f0af272e15997a9103d56c08f8260fe72e Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sat, 16 Dec 2023 12:32:00 +0000
Subject: [PATCH 114/202] Revert "CI: build build-tools image" (#6156)

It turns out the issue with skipped jobs is not so trivial (because
GitHub checks jobs transitively). A possible workaround with `if:
always() && contains(fromJSON('["success", "skipped"]'),
needs.build-buildtools-image.result)` would tangle the workflow really
badly. We'll need to come up with a better solution.

To unblock main, I'm going to revert
https://github.com/neondatabase/neon/pull/6082.
---
 .../workflows/build_and_push_docker_image.yml |  57 ------
 .github/workflows/build_and_test.yml          |  82 ++-------
 .../workflows/update_build_tools_image.yml    | 130 --------------
 .gitignore                                    |   1 -
 CONTRIBUTING.md                               |  14 --
 Dockerfile                                    |   2 +-
 Dockerfile.buildtools                         | 165 ------------------
 Dockerfile.compute-node                       |   2 +-
 Dockerfile.compute-tools                      |   2 +-
 9 files changed, 21 insertions(+), 434 deletions(-)
 delete mode 100644 .github/workflows/build_and_push_docker_image.yml
 delete mode 100644 .github/workflows/update_build_tools_image.yml
 delete mode 100644 Dockerfile.buildtools

diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
deleted file mode 100644
index 8780c0fef7..0000000000
--- a/.github/workflows/build_and_push_docker_image.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  workflow_call:
-    inputs:
-      dockerfile-path:
-        required: true
-        type: string
-      image-name:
-        required: true
-        type: string
-      tag:
-        required: true
-        type: string
-
-jobs:
-  kaniko:
-    runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-amd64
-
-  kaniko-arm:
-    runs-on: [ self-hosted, dev, arm64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-arm64
-
-  manifest:
-    name: 'manifest'
-    runs-on: [ self-hosted, dev, x64 ]
-    needs:
-      - kaniko
-      - kaniko-arm
-
-    steps:
-      - name: Create manifest
-        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-arm64
-
-      - name: Push manifest
-        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 808d39b33c..693ed1a66f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -44,13 +44,13 @@ jobs:
 
         exit 1
 
+
   tag:
     needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
     outputs:
       build-tag: ${{steps.build-tag.outputs.tag}}
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
 
     steps:
       - name: Checkout
@@ -74,50 +74,11 @@ jobs:
         shell: bash
         id: build-tag
 
-      - name: Get buildtools tag
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then
-            IMAGE_TAG=$GITHUB_RUN_ID
-          else
-            IMAGE_TAG=pinned
-          fi
-          
-          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
-        shell: bash
-        id: buildtools-tag
-
-  check-if-build-tools-dockerfile-changed:
-    needs: [ check-permissions ]
-    runs-on: ubuntu-latest
-    outputs:
-      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
-    steps:
-      - name: Check if Dockerfile.buildtools has changed
-        id: dockerfile
-        run: |
-          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
-          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
-            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  build-buildtools-image:
-    needs: [ check-permissions, check-if-build-tools-dockerfile-changed, tag]
-    uses: ./.github/workflows/build_and_push_docker_image.yml
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    with:
-      dockerfile-path: Dockerfile.buildtools
-      image-name: build-tools
-      tag: ${{ needs.tag.outputs.build-tools-tag}}
-    secrets: inherit
-
   check-codestyle-python:
-    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
-    needs: [ check-permissions, build-buildtools-image , tag ]
+    needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
 
     steps:
@@ -147,11 +108,10 @@ jobs:
         run: poetry run mypy .
 
   check-codestyle-rust:
-    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
 
     steps:
@@ -215,11 +175,10 @@ jobs:
         run: cargo deny check --hide-inclusion-graph
 
   build-neon:
-    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
-    needs: [ check-permissions, tag, build-buildtools-image ]
+    needs: [ check-permissions, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
     strategy:
       fail-fast: false
@@ -452,7 +411,7 @@ jobs:
     needs: [ check-permissions, build-neon, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       # Default shared memory is 64mb
       options: --init --shm-size=512mb
     strategy:
@@ -488,10 +447,10 @@ jobs:
         uses: ./.github/actions/save-coverage-data
 
   benchmarks:
-    needs: [ check-permissions, build-neon, tag]
+    needs: [ check-permissions, build-neon ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       # Default shared memory is 64mb
       options: --init --shm-size=512mb
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -520,12 +479,12 @@ jobs:
       # while coverage is currently collected for the debug ones
 
   create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, tag ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
     if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
 
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
 
     steps:
@@ -567,10 +526,11 @@ jobs:
             })
 
   coverage-report:
-    needs: [ check-permissions, regress-tests, tag ]
+    needs: [ check-permissions, regress-tests ]
+
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
     strategy:
       fail-fast: false
@@ -734,8 +694,7 @@ jobs:
             }"
 
   neon-image:
-    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
@@ -774,7 +733,6 @@ jobs:
                            --context .
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-                           --build-arg TAG=${{ needs.tag.outputs.build-tools-tag }}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                            --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -784,9 +742,8 @@ jobs:
         run: rm -rf ~/.ecr
 
   compute-tools-image:
-    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
     runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
     container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
       run:
@@ -821,7 +778,6 @@ jobs:
                            --context .
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.tag.outputs.build-tools-tag}}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --dockerfile Dockerfile.compute-tools
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -832,8 +788,7 @@ jobs:
         run: rm -rf ~/.ecr
 
   compute-node-image:
-    if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: gcr.io/kaniko-project/executor:v1.9.2-debug
@@ -881,7 +836,6 @@ jobs:
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg PG_VERSION=${{ matrix.version }}
                            --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.tag.outputs.build-tools-tag}}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --dockerfile Dockerfile.compute-node
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml
deleted file mode 100644
index 88bab797b7..0000000000
--- a/.github/workflows/update_build_tools_image.yml
+++ /dev/null
@@ -1,130 +0,0 @@
-name: 'Update build tools image tag'
-
-# This workflow it used to update tag of build tools in ECR.
-# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
-
-on:
-  workflow_dispatch:
-    inputs:
-      from-tag:
-        description: 'Source tag'
-        required: true
-        type: string
-      to-tag:
-        description: 'Destination tag'
-        required: true
-        type: string
-        default: 'pinned'
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-permissions: {}
-
-jobs:
-  tag-image:
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-    outputs:
-      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
-      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Get source image digest
-        id: next-digest
-        run: |
-          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
-            exit 1
-          fi
-
-          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
-          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
-
-      - name: Get destination image digest (if already exists)
-        id: prev-digest
-        run: |
-          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
-          if [ -z "${PREV_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
-          else
-            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
-
-            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Tag image
-        run: |
-          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
-
-  rollback-tag-image:
-    needs:  tag-image
-    if: ${{ !success() }}
-
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Restore previous tag if needed
-        run: |
-          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
-          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
-
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
-            exit 0
-          fi
-
-          if [ -z "${PREV_DIGEST}" ]; then
-            # I guess we should delete the tag here/untag the image, but crane does not support it
-            # - https://github.com/google/go-containerregistry/issues/999
-
-            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
-
-            exit 0
-          fi
-
-          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
-          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
-            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
-
-            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
-          else
-            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
-          fi
diff --git a/.gitignore b/.gitignore
index 3f4495c9e7..c5fc121ac2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@ __pycache__/
 test_output/
 .vscode
 .idea
-neon.iml
 /.neon
 /integration_tests/.neon
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b318c295a3..2692684006 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -70,17 +70,3 @@ We're using the following approach to make it work:
 - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
 
 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
-
-## How do I add the "pinned" tag to an buildtools image?
-We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
-
-You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
-or using GitHub CLI:
-
-```bash
-gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-            -f from-tag=6254913013 \
-            -f to-tag=pinned \
-
-# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
-```
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 5d5fde4f14..60de9cfa3e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,7 @@
 ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
 ### inside this image in the real deployments.
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 
 # Build Postgres
diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools
deleted file mode 100644
index d3d05b4e20..0000000000
--- a/Dockerfile.buildtools
+++ /dev/null
@@ -1,165 +0,0 @@
-FROM debian:bullseye-slim
-
-# Add nonroot user
-RUN useradd -ms /bin/bash nonroot -b /home
-SHELL ["/bin/bash", "-c"]
-
-# System deps
-RUN set -e \
-    && apt update \
-    && apt install -y \
-        autoconf \
-        automake \
-        bison \
-        build-essential \
-        ca-certificates \
-        cmake \
-        curl \
-        flex \
-        git \
-        gnupg \
-        gzip \
-        jq \
-        libcurl4-openssl-dev \
-        libbz2-dev \
-        libffi-dev \
-        liblzma-dev \
-        libncurses5-dev \
-        libncursesw5-dev \
-        libpq-dev \
-        libreadline-dev \
-        libseccomp-dev \
-        libsqlite3-dev \
-        libssl-dev \
-        libstdc++-10-dev \
-        libtool \
-        libxml2-dev \
-        libxmlsec1-dev \
-        libxxhash-dev \
-        lsof \
-        make \
-        netcat \
-        net-tools \
-        openssh-client \
-        parallel \
-        pkg-config \
-        unzip \
-        wget \
-        xz-utils \
-        zlib1g-dev \
-        zstd \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# protobuf-compiler (protoc)
-ENV PROTOC_VERSION 22.2
-RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
-    && unzip -q protoc.zip -d protoc \
-    && mv protoc/bin/protoc /usr/local/bin/protoc \
-    && mv protoc/include/google /usr/local/include/google \
-    && rm -rf protoc.zip protoc
-
-# LLVM
-ENV LLVM_VERSION=17
-RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
-    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
-    && apt update \
-    && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
-    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# PostgreSQL 14
-RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
-    && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
-    && apt update \
-    && apt install -y postgresql-client-14 \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# AWS CLI
-RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
-    && unzip -q awscliv2.zip \
-    && ./aws/install \
-    && rm awscliv2.zip
-
-# Mold: A Modern Linker
-ENV MOLD_VERSION v2.1.0
-RUN set -e \
-    && git clone https://github.com/rui314/mold.git \
-    && mkdir mold/build \
-    && cd mold/build \
-    && git checkout ${MOLD_VERSION} \
-    && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
-    && cmake --build . -j $(nproc) \
-    && cmake --install . \
-    && cd .. \
-    && rm -rf mold
-
-# LCOV
-# Build lcov from a fork:
-# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
-# And patches from us:
-# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
-RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
-    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
-    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992  lcov.tar.gz" | sha256sum --check \
-    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
-    && cd lcov \
-    && make install \
-    && rm -rf ../lcov.tar.gz
-
-# Switch to nonroot user
-USER nonroot:nonroot
-WORKDIR /home/nonroot
-
-# Python
-ENV PYTHON_VERSION=3.9.2 \
-    PYENV_ROOT=/home/nonroot/.pyenv \
-    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
-RUN set -e \
-    && cd $HOME \
-    && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
-    && chmod +x pyenv-installer \
-    && ./pyenv-installer \
-    && export PYENV_ROOT=/home/nonroot/.pyenv \
-    && export PATH="$PYENV_ROOT/bin:$PATH" \
-    && export PATH="$PYENV_ROOT/shims:$PATH" \
-    && pyenv install ${PYTHON_VERSION} \
-    && pyenv global ${PYTHON_VERSION} \
-    && python --version \
-    && pip install --upgrade pip \
-    && pip --version \
-    && pip install pipenv wheel poetry
-
-# Switch to nonroot user (again)
-USER nonroot:nonroot
-WORKDIR /home/nonroot
-
-# Rust
-# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.74.0
-ENV RUSTUP_HOME="/home/nonroot/.rustup"
-ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
-	chmod +x rustup-init && \
-	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
-	rm rustup-init && \
-    export PATH="$HOME/.cargo/bin:$PATH" && \
-    . "$HOME/.cargo/env" && \
-    cargo --version && rustup --version && \
-    rustup component add llvm-tools-preview rustfmt clippy && \
-    cargo install --git https://github.com/paritytech/cachepot && \
-    cargo install rustfilt && \
-    cargo install cargo-hakari && \
-    cargo install cargo-deny && \
-    cargo install cargo-hack && \
-    rm -rf /home/nonroot/.cargo/registry && \
-    rm -rf /home/nonroot/.cargo/git
-ENV RUSTC_WRAPPER=cachepot
-
-# Show versions
-RUN whoami \
-    && python --version \
-    && pip --version \
-    && cargo --version --verbose \
-    && rustup --version --verbose \
-    && rustc --version --verbose \
-    && clang --version
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 3809eb09a9..03280586f8 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -1,6 +1,6 @@
 ARG PG_VERSION
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG
 
diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools
index cc305cc556..3066e3f7ca 100644
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,7 +1,7 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG
 

From e1935f42a1d1de4948fa6f689f2fe4cd83d15549 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 27 Nov 2023 15:40:01 +0300
Subject: [PATCH 115/202] Don't generate core dump when walproposer
 intentionally panics.

Walproposer sometimes intentionally PANICs when its term is defeated, as the
basebackup is likely spoiled by that time. We don't want a core dump in this
case.
---
 pgxn/neon/neon_utils.c  | 25 +++++++++++++++++++++++++
 pgxn/neon/neon_utils.h  |  1 +
 pgxn/neon/walproposer.c | 15 ++++++++++++++-
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c
index 06faea7490..807d2decf6 100644
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -1,3 +1,6 @@
+
+#include <sys/resource.h>
+
 #include "postgres.h"
 
 #include "access/timeline.h"
@@ -114,3 +117,25 @@ pq_sendint64_le(StringInfo buf, uint64 i)
 	memcpy(buf->data + buf->len, &i, sizeof(uint64));
 	buf->len += sizeof(uint64);
 }
+
+/*
+ * Disables core dump for the current process.
+ */
+void
+disable_core_dump()
+{
+	struct rlimit rlim;
+
+#ifdef WALPROPOSER_LIB			/* skip in simulation mode */
+	return;
+#endif
+
+	rlim.rlim_cur = 0;
+	rlim.rlim_max = 0;
+	if (setrlimit(RLIMIT_CORE, &rlim))
+	{
+		int			save_errno = errno;
+
+		fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno));
+	}
+}
diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h
index e3fafc8d0f..20745d8b26 100644
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -8,5 +8,6 @@ uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
 void		pq_sendint32_le(StringInfo buf, uint32 i);
 void		pq_sendint64_le(StringInfo buf, uint64 i);
+extern void disable_core_dump();
 
 #endif							/* __NEON_UTILS_H__ */
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 7d9dbfdb7f..fc3332612c 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -35,6 +35,8 @@
  *
  *-------------------------------------------------------------------------
  */
+#include <sys/resource.h>
+
 #include "postgres.h"
 #include "libpq/pqformat.h"
 #include "neon.h"
@@ -1069,6 +1071,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 											walprop_shared->mineLastElectedTerm)))
 			{
+				/*
+				 * Panic to restart PG as we need to retake basebackup.
+				 * However, don't dump core as this is kinda expected
+				 * scenario.
+				 */
+				disable_core_dump();
 				walprop_log(PANIC,
 							"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
 							LSN_FORMAT_ARGS(wp->propEpochStartLsn),
@@ -1445,7 +1453,12 @@ RecvAppendResponses(Safekeeper *sk)
 
 		if (sk->appendResponse.term > wp->propTerm)
 		{
-			/* Another compute with higher term is running. */
+			/*
+			 * Another compute with higher term is running. Panic to restart
+			 * PG as we likely need to retake basebackup. However, don't dump
+			 * core as this is kinda expected scenario.
+			 */
+			disable_core_dump();
 			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
 						sk->host, sk->port,
 						sk->appendResponse.term, wp->propTerm);

From dbdb1d21f2eae8e9616dd12656868fcdb59c603e Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 18 Dec 2023 10:29:19 +0000
Subject: [PATCH 116/202] pageserver: on-demand activation cleanups (#6157)

## Problem

#6112 added some logs and metrics; clean these up a bit:
- Avoid counting startup completions for tenants launched after startup
- Exclude no-op cases from timing histograms
- Remove a rogue log message
---
 pageserver/src/tenant.rs | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1478a1a445..eceef6bf78 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -629,9 +629,12 @@ impl Tenant {
             "attach tenant",
             false,
             async move {
+                // Is this tenant being spawned as part of process startup?
+                let starting_up = init_order.is_some();
                 scopeguard::defer! {
-                    tracing::info!("Increment complete count");
-                    TENANT.startup_complete.inc();
+                    if starting_up {
+                        TENANT.startup_complete.inc();
+                    }
                 }
 
                 // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
@@ -711,7 +714,11 @@ impl Tenant {
 
                 let preload_timer = TENANT.preload.start_timer();
                 let preload = match mode {
-                    SpawnMode::Create => {None},
+                    SpawnMode::Create => {
+                        // Don't count the skipped preload into the histogram of preload durations
+                        preload_timer.stop_and_discard();
+                        None
+                    },
                     SpawnMode::Normal => {
                         match &remote_storage {
                             Some(remote_storage) => Some(
@@ -721,7 +728,11 @@ impl Tenant {
                                         tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
                                     )
                                     .await {
-                                        Ok(p) => p,
+                                        Ok(p) => {
+                                            preload_timer.observe_duration();
+                                            p
+                                        }
+                                            ,
                                         Err(e) => {
                                             make_broken(&tenant_clone, anyhow::anyhow!(e));
                                                 return Ok(());
@@ -732,7 +743,6 @@ impl Tenant {
                         }
                     }
                 };
-                preload_timer.observe_duration();
 
                 // Remote preload is complete.
                 drop(remote_load_completion);
@@ -784,15 +794,19 @@ impl Tenant {
                     }
                 }
 
-                let attach_timer = TENANT.attach.start_timer();
+                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
+                let attach_timer = match mode {
+                    SpawnMode::Create => None,
+                    SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                };
                 match tenant_clone.attach(preload, &ctx).await {
                     Ok(()) => {
                         info!("attach finished, activating");
-                        attach_timer.observe_duration();
+                        if let Some(t)=  attach_timer {t.observe_duration();}
                         tenant_clone.activate(broker_client, None, &ctx);
                     }
                     Err(e) => {
-                        attach_timer.observe_duration();
+                        if let Some(t)=  attach_timer {t.observe_duration();}
                         make_broken(&tenant_clone, anyhow::anyhow!(e));
                     }
                 }

From 17bde7eda516302a57f46bd95140e8d89e68af73 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 18 Dec 2023 10:59:49 +0000
Subject: [PATCH 117/202] proxy refactor large files (#6153)

## Problem

The `src/proxy.rs` file is far too large

## Summary of changes

Creates 3 new files:
```
src/metrics.rs
src/proxy/retry.rs
src/proxy/connect_compute.rs
```
---
 proxy/src/auth/backend.rs             |   4 +-
 proxy/src/auth/backend/classic.rs     |   2 +-
 proxy/src/auth/backend/hacks.rs       |   2 +-
 proxy/src/auth/credentials.rs         |   5 +-
 proxy/src/compute.rs                  |   7 +-
 proxy/src/console/provider.rs         |   2 +-
 proxy/src/console/provider/neon.rs    |   2 +-
 proxy/src/http.rs                     |   2 +-
 proxy/src/lib.rs                      |   1 +
 proxy/src/metrics.rs                  | 232 +++++++++++
 proxy/src/proxy.rs                    | 542 +-------------------------
 proxy/src/proxy/connect_compute.rs    | 238 +++++++++++
 proxy/src/proxy/retry.rs              |  68 ++++
 proxy/src/proxy/tests.rs              |   5 +-
 proxy/src/proxy/tests/mitm.rs         |   4 +-
 proxy/src/rate_limiter/limiter.rs     |   6 +-
 proxy/src/serverless.rs               |   2 +-
 proxy/src/serverless/conn_pool.rs     |   7 +-
 proxy/src/serverless/sql_over_http.rs |   2 +-
 19 files changed, 579 insertions(+), 554 deletions(-)
 create mode 100644 proxy/src/metrics.rs
 create mode 100644 proxy/src/proxy/connect_compute.rs
 create mode 100644 proxy/src/proxy/retry.rs

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index ba054b53eb..3b09e05bd2 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -11,7 +11,8 @@ use crate::auth::validate_password_and_exchange;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::AuthInfo;
 use crate::console::AuthSecret;
-use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
+use crate::proxy::connect_compute::handle_try_wake;
+use crate::proxy::retry::retry_after;
 use crate::scram;
 use crate::stream::Stream;
 use crate::{
@@ -22,6 +23,7 @@ use crate::{
         provider::{CachedNodeInfo, ConsoleReqExtra},
         Api,
     },
+    metrics::LatencyTimer,
     stream, url,
 };
 use futures::TryFutureExt;
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index ce52daf16c..5c394ec649 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -4,7 +4,7 @@ use crate::{
     compute,
     config::AuthenticationConfig,
     console::AuthSecret,
-    proxy::LatencyTimer,
+    metrics::LatencyTimer,
     sasl,
     stream::{PqStream, Stream},
 };
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index abbd25008b..5dde514bca 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -4,7 +4,7 @@ use super::{
 use crate::{
     auth::{self, AuthFlow},
     console::AuthSecret,
-    proxy::LatencyTimer,
+    metrics::LatencyTimer,
     sasl,
     stream::{self, Stream},
 };
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 72149e8e29..c04769a199 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,9 +1,8 @@
 //! User credentials used in authentication.
 
 use crate::{
-    auth::password_hack::parse_endpoint_param,
-    error::UserFacingError,
-    proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
+    auth::password_hack::parse_endpoint_param, error::UserFacingError,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index f5f7270bf4..a54ba56e43 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,9 +1,6 @@
 use crate::{
-    auth::parse_endpoint_param,
-    cancellation::CancelClosure,
-    console::errors::WakeComputeError,
-    error::UserFacingError,
-    proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE},
+    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
+    error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index deab966d9e..8d399f26ea 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -21,7 +21,7 @@ pub mod errors {
     use crate::{
         error::{io_error, UserFacingError},
         http,
-        proxy::ShouldRetry,
+        proxy::retry::ShouldRetry,
     };
     use thiserror::Error;
 
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 192252a0df..f748c9a41f 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -5,7 +5,7 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
-use crate::proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
+use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
 use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 09423eca77..59e1492ed4 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -13,7 +13,7 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio::time::Instant;
 use tracing::trace;
 
-use crate::{proxy::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
+use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
 use reqwest_middleware::RequestBuilder;
 
 /// This is the preferred way to create new http clients,
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index a22600cbb3..2da1eaf482 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -16,6 +16,7 @@ pub mod console;
 pub mod error;
 pub mod http;
 pub mod logging;
+pub mod metrics;
 pub mod parse;
 pub mod protocol2;
 pub mod proxy;
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
new file mode 100644
index 0000000000..8e2a6105b1
--- /dev/null
+++ b/proxy/src/metrics.rs
@@ -0,0 +1,232 @@
+use ::metrics::{
+    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
+    IntCounterPairVec, IntCounterVec,
+};
+use prometheus::{
+    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
+    IntGaugeVec,
+};
+
+use once_cell::sync::Lazy;
+use tokio::time;
+
+pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_opened_db_connections_total",
+        "Number of opened connections to a database.",
+        "proxy_closed_db_connections_total",
+        "Number of closed connections to a database.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_opened_client_connections_total",
+        "Number of opened connections from a client.",
+        "proxy_closed_client_connections_total",
+        "Number of closed connections from a client.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_accepted_connections_total",
+        "Number of client connections accepted.",
+        "proxy_closed_connections_total",
+        "Number of client connections closed.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "proxy_compute_connection_latency_seconds",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // http/ws/tcp, true/false, true/false, success/failure
+        // 3 * 2 * 2 * 2 = 24 counters
+        &["protocol", "cache_miss", "pool_miss", "outcome"],
+        // largest bucket = 2^16 * 0.5ms = 32s
+        exponential_buckets(0.0005, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "proxy_console_request_latency",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // proxy_wake_compute/proxy_get_role_info
+        &["request"],
+        // largest bucket = 2^16 * 0.2ms = 13s
+        exponential_buckets(0.0002, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_allowed_ips_cache_misses",
+        "Number of cache hits/misses for allowed ips",
+        // hit/miss
+        &["outcome"],
+    )
+    .unwrap()
+});
+
+pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_control_plane_token_acquire_seconds",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // largest bucket = 3^16 * 0.05ms = 2.15s
+        exponential_buckets(0.00005, 3.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "semaphore_control_plane_limit",
+        "Current limit of the semaphore control plane",
+        &["limit"], // 2 counters
+    )
+    .unwrap()
+});
+
+pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_accepted_connections_by_sni",
+        "Number of connections (per sni).",
+        &["kind"],
+    )
+    .unwrap()
+});
+
+pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_allowed_ips_number",
+        "Number of allowed ips",
+        vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
+    )
+    .unwrap()
+});
+
+pub struct LatencyTimer {
+    // time since the stopwatch was started
+    start: Option<time::Instant>,
+    // accumulated time on the stopwatch
+    accumulated: std::time::Duration,
+    // label data
+    protocol: &'static str,
+    cache_miss: bool,
+    pool_miss: bool,
+    outcome: &'static str,
+}
+
+pub struct LatencyTimerPause<'a> {
+    timer: &'a mut LatencyTimer,
+}
+
+impl LatencyTimer {
+    pub fn new(protocol: &'static str) -> Self {
+        Self {
+            start: Some(time::Instant::now()),
+            accumulated: std::time::Duration::ZERO,
+            protocol,
+            cache_miss: false,
+            // by default we don't do pooling
+            pool_miss: true,
+            // assume failed unless otherwise specified
+            outcome: "failed",
+        }
+    }
+
+    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
+        // stop the stopwatch and record the time that we have accumulated
+        let start = self.start.take().expect("latency timer should be started");
+        self.accumulated += start.elapsed();
+        LatencyTimerPause { timer: self }
+    }
+
+    pub fn cache_miss(&mut self) {
+        self.cache_miss = true;
+    }
+
+    pub fn pool_hit(&mut self) {
+        self.pool_miss = false;
+    }
+
+    pub fn success(mut self) {
+        self.outcome = "success";
+    }
+}
+
+impl Drop for LatencyTimerPause<'_> {
+    fn drop(&mut self) {
+        // start the stopwatch again
+        self.timer.start = Some(time::Instant::now());
+    }
+}
+
+impl Drop for LatencyTimer {
+    fn drop(&mut self) {
+        let duration =
+            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
+        COMPUTE_CONNECTION_LATENCY
+            .with_label_values(&[
+                self.protocol,
+                bool_to_str(self.cache_miss),
+                bool_to_str(self.pool_miss),
+                self.outcome,
+            ])
+            .observe(duration.as_secs_f64())
+    }
+}
+
+pub static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_connection_failures_total",
+        "Number of connection failures (per kind).",
+        &["kind"],
+    )
+    .unwrap()
+});
+
+pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_connection_failures_breakdown",
+        "Number of wake-up failures (per kind).",
+        &["retry", "kind"],
+    )
+    .unwrap()
+});
+
+pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_io_bytes_per_client",
+        "Number of bytes sent/received between client and backend.",
+        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
+    )
+    .unwrap()
+});
+
+pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_io_bytes",
+        "Number of bytes sent/received between all clients and backends.",
+        &["direction"],
+    )
+    .unwrap()
+});
+
+pub const fn bool_to_str(x: bool) -> &'static str {
+    if x {
+        "true"
+    } else {
+        "false"
+    }
+}
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index da65065179..17e910860c 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -1,265 +1,41 @@
 #[cfg(test)]
 mod tests;
 
+pub mod connect_compute;
+pub mod retry;
+
 use crate::{
     auth,
     cancellation::{self, CancelMap},
-    compute::{self, PostgresConnection},
+    compute,
     config::{AuthenticationConfig, ProxyConfig, TlsConfig},
-    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
-    http::StatusCode,
+    console::{self, messages::MetricsAuxInfo},
+    metrics::{
+        LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
+        NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
+    },
     protocol2::WithClientIp,
     rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     usage_metrics::{Ids, USAGE_METRICS},
 };
 use anyhow::{bail, Context};
-use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
-use metrics::{
-    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
-    IntCounterPairVec, IntCounterVec,
-};
-use once_cell::sync::{Lazy, OnceCell};
+use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
-use prometheus::{
-    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
-    IntGaugeVec,
-};
 use regex::Regex;
-use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
-use tokio::{
-    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
-    time,
-};
+use std::{net::IpAddr, sync::Arc};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, info_span, Instrument};
 use utils::measured_stream::MeasuredStream;
 
-/// Number of times we should retry the `/proxy_wake_compute` http request.
-/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
-pub const NUM_RETRIES_CONNECT: u32 = 16;
-const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
-const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
-const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
+use self::connect_compute::{connect_to_compute, TcpMechanism};
 
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";
 
-pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_opened_db_connections_total",
-        "Number of opened connections to a database.",
-        "proxy_closed_db_connections_total",
-        "Number of closed connections to a database.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_opened_client_connections_total",
-        "Number of opened connections from a client.",
-        "proxy_closed_client_connections_total",
-        "Number of closed connections from a client.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_accepted_connections_total",
-        "Number of client connections accepted.",
-        "proxy_closed_connections_total",
-        "Number of client connections closed.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_compute_connection_latency_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // http/ws/tcp, true/false, true/false, success/failure
-        // 3 * 2 * 2 * 2 = 24 counters
-        &["protocol", "cache_miss", "pool_miss", "outcome"],
-        // largest bucket = 2^16 * 0.5ms = 32s
-        exponential_buckets(0.0005, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_console_request_latency",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // proxy_wake_compute/proxy_get_role_info
-        &["request"],
-        // largest bucket = 2^16 * 0.2ms = 13s
-        exponential_buckets(0.0002, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_allowed_ips_cache_misses",
-        "Number of cache hits/misses for allowed ips",
-        // hit/miss
-        &["outcome"],
-    )
-    .unwrap()
-});
-
-pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_control_plane_token_acquire_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
-        exponential_buckets(0.00005, 3.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "semaphore_control_plane_limit",
-        "Current limit of the semaphore control plane",
-        &["limit"], // 2 counters
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_accepted_connections_by_sni",
-        "Number of connections (per sni).",
-        &["kind"],
-    )
-    .unwrap()
-});
-
-pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_allowed_ips_number",
-        "Number of allowed ips",
-        vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
-    )
-    .unwrap()
-});
-
-pub struct LatencyTimer {
-    // time since the stopwatch was started
-    start: Option<Instant>,
-    // accumulated time on the stopwatch
-    accumulated: std::time::Duration,
-    // label data
-    protocol: &'static str,
-    cache_miss: bool,
-    pool_miss: bool,
-    outcome: &'static str,
-}
-
-pub struct LatencyTimerPause<'a> {
-    timer: &'a mut LatencyTimer,
-}
-
-impl LatencyTimer {
-    pub fn new(protocol: &'static str) -> Self {
-        Self {
-            start: Some(Instant::now()),
-            accumulated: std::time::Duration::ZERO,
-            protocol,
-            cache_miss: false,
-            // by default we don't do pooling
-            pool_miss: true,
-            // assume failed unless otherwise specified
-            outcome: "failed",
-        }
-    }
-
-    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
-        // stop the stopwatch and record the time that we have accumulated
-        let start = self.start.take().expect("latency timer should be started");
-        self.accumulated += start.elapsed();
-        LatencyTimerPause { timer: self }
-    }
-
-    pub fn cache_miss(&mut self) {
-        self.cache_miss = true;
-    }
-
-    pub fn pool_hit(&mut self) {
-        self.pool_miss = false;
-    }
-
-    pub fn success(mut self) {
-        self.outcome = "success";
-    }
-}
-
-impl Drop for LatencyTimerPause<'_> {
-    fn drop(&mut self) {
-        // start the stopwatch again
-        self.timer.start = Some(Instant::now());
-    }
-}
-
-impl Drop for LatencyTimer {
-    fn drop(&mut self) {
-        let duration =
-            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
-        COMPUTE_CONNECTION_LATENCY
-            .with_label_values(&[
-                self.protocol,
-                bool_to_str(self.cache_miss),
-                bool_to_str(self.pool_miss),
-                self.outcome,
-            ])
-            .observe(duration.as_secs_f64())
-    }
-}
-
-static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_total",
-        "Number of connection failures (per kind).",
-        &["kind"],
-    )
-    .unwrap()
-});
-
-static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_breakdown",
-        "Number of wake-up failures (per kind).",
-        &["retry", "kind"],
-    )
-    .unwrap()
-});
-
-static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_io_bytes_per_client",
-        "Number of bytes sent/received between client and backend.",
-        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
-    )
-    .unwrap()
-});
-
-static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_io_bytes",
-        "Number of bytes sent/received between all clients and backends.",
-        &["direction"],
-    )
-    .unwrap()
-});
-
 pub async fn run_until_cancelled<F: std::future::Future>(
     f: F,
     cancellation_token: &CancellationToken,
@@ -539,296 +315,6 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
     }
 }
 
-/// If we couldn't connect, a cached connection info might be to blame
-/// (e.g. the compute node's address might've changed at the wrong time).
-/// Invalidate the cache entry (if any) to prevent subsequent errors.
-#[tracing::instrument(name = "invalidate_cache", skip_all)]
-pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
-    let is_cached = node_info.cached();
-    if is_cached {
-        warn!("invalidating stalled compute node info cache entry");
-    }
-    let label = match is_cached {
-        true => "compute_cached",
-        false => "compute_uncached",
-    };
-    NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
-
-    node_info.invalidate().config
-}
-
-/// Try to connect to the compute node once.
-#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
-async fn connect_to_compute_once(
-    node_info: &console::CachedNodeInfo,
-    timeout: time::Duration,
-    proto: &'static str,
-) -> Result<PostgresConnection, compute::ConnectionError> {
-    let allow_self_signed_compute = node_info.allow_self_signed_compute;
-
-    node_info
-        .config
-        .connect(allow_self_signed_compute, timeout, proto)
-        .await
-}
-
-#[async_trait]
-pub trait ConnectMechanism {
-    type Connection;
-    type ConnectError;
-    type Error: From<Self::ConnectError>;
-    async fn connect_once(
-        &self,
-        node_info: &console::CachedNodeInfo,
-        timeout: time::Duration,
-    ) -> Result<Self::Connection, Self::ConnectError>;
-
-    fn update_connect_config(&self, conf: &mut compute::ConnCfg);
-}
-
-pub struct TcpMechanism<'a> {
-    /// KV-dictionary with PostgreSQL connection params.
-    pub params: &'a StartupMessageParams,
-    pub proto: &'static str,
-}
-
-#[async_trait]
-impl ConnectMechanism for TcpMechanism<'_> {
-    type Connection = PostgresConnection;
-    type ConnectError = compute::ConnectionError;
-    type Error = compute::ConnectionError;
-
-    async fn connect_once(
-        &self,
-        node_info: &console::CachedNodeInfo,
-        timeout: time::Duration,
-    ) -> Result<PostgresConnection, Self::Error> {
-        connect_to_compute_once(node_info, timeout, self.proto).await
-    }
-
-    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
-        config.set_startup_params(self.params);
-    }
-}
-
-const fn bool_to_str(x: bool) -> &'static str {
-    if x {
-        "true"
-    } else {
-        "false"
-    }
-}
-
-fn report_error(e: &WakeComputeError, retry: bool) {
-    use crate::console::errors::ApiError;
-    let retry = bool_to_str(retry);
-    let kind = match e {
-        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
-        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ref text,
-        }) if text.contains("written data quota exceeded")
-            || text.contains("the limit for current plan reached") =>
-        {
-            "quota_exceeded"
-        }
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ..
-        }) => "api_console_locked",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::BAD_REQUEST,
-            ..
-        }) => "api_console_bad_request",
-        WakeComputeError::ApiError(ApiError::Console { status, .. })
-            if status.is_server_error() =>
-        {
-            "api_console_other_server_error"
-        }
-        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
-        WakeComputeError::TimeoutError => "timeout_error",
-    };
-    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
-}
-
-/// Try to connect to the compute node, retrying if necessary.
-/// This function might update `node_info`, so we take it by `&mut`.
-#[tracing::instrument(skip_all)]
-pub async fn connect_to_compute<M: ConnectMechanism>(
-    mechanism: &M,
-    mut node_info: console::CachedNodeInfo,
-    extra: &console::ConsoleReqExtra,
-    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
-    mut latency_timer: LatencyTimer,
-) -> Result<M::Connection, M::Error>
-where
-    M::ConnectError: ShouldRetry + std::fmt::Debug,
-    M::Error: From<WakeComputeError>,
-{
-    mechanism.update_connect_config(&mut node_info.config);
-
-    // try once
-    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-        Ok(res) => {
-            latency_timer.success();
-            return Ok(res);
-        }
-        Err(e) => {
-            error!(error = ?e, "could not connect to compute node");
-            (invalidate_cache(node_info), e)
-        }
-    };
-
-    latency_timer.cache_miss();
-
-    let mut num_retries = 1;
-
-    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-    info!("compute node's state has likely changed; requesting a wake-up");
-    let node_info = loop {
-        let wake_res = match creds {
-            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
-            #[cfg(feature = "testing")]
-            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
-            // nothing to do?
-            auth::BackendType::Link(_) => return Err(err.into()),
-            // test backend
-            #[cfg(test)]
-            auth::BackendType::Test(x) => x.wake_compute(),
-        };
-
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                report_error(&e, false);
-                return Err(e.into());
-            }
-            // failed to wake up but we can continue to retry
-            Ok(ControlFlow::Continue(e)) => {
-                report_error(&e, true);
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            // successfully woke up a compute node and can break the wakeup loop
-            Ok(ControlFlow::Break(mut node_info)) => {
-                node_info.config.reuse_password(&config);
-                mechanism.update_connect_config(&mut node_info.config);
-                break node_info;
-            }
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-    };
-
-    // now that we have a new node, try connect to it repeatedly.
-    // this can error for a few reasons, for instance:
-    // * DNS connection settings haven't quite propagated yet
-    info!("wake_compute success. attempting to connect");
-    loop {
-        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-            Ok(res) => {
-                latency_timer.success();
-                return Ok(res);
-            }
-            Err(e) => {
-                let retriable = e.should_retry(num_retries);
-                if !retriable {
-                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
-                    return Err(e.into());
-                }
-                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
-            }
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-    }
-}
-
-/// Attempts to wake up the compute node.
-/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
-/// * Returns Ok(Break(node)) if the wakeup succeeded
-/// * Returns Err(e) if there was an error
-pub fn handle_try_wake(
-    result: Result<console::CachedNodeInfo, WakeComputeError>,
-    num_retries: u32,
-) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
-    match result {
-        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
-                Ok(ControlFlow::Continue(err))
-            }
-            _ => Err(err),
-        },
-        // Ready to try again.
-        Ok(new) => Ok(ControlFlow::Break(new)),
-    }
-}
-
-pub trait ShouldRetry {
-    fn could_retry(&self) -> bool;
-    fn should_retry(&self, num_retries: u32) -> bool {
-        match self {
-            _ if num_retries >= NUM_RETRIES_CONNECT => false,
-            err => err.could_retry(),
-        }
-    }
-}
-
-impl ShouldRetry for io::Error {
-    fn could_retry(&self) -> bool {
-        use std::io::ErrorKind;
-        matches!(
-            self.kind(),
-            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
-        )
-    }
-}
-
-impl ShouldRetry for tokio_postgres::error::DbError {
-    fn could_retry(&self) -> bool {
-        use tokio_postgres::error::SqlState;
-        matches!(
-            self.code(),
-            &SqlState::CONNECTION_FAILURE
-                | &SqlState::CONNECTION_EXCEPTION
-                | &SqlState::CONNECTION_DOES_NOT_EXIST
-                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
-        )
-    }
-}
-
-impl ShouldRetry for tokio_postgres::Error {
-    fn could_retry(&self) -> bool {
-        if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
-            io::Error::could_retry(io_err)
-        } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
-            tokio_postgres::error::DbError::could_retry(db_err)
-        } else {
-            false
-        }
-    }
-}
-
-impl ShouldRetry for compute::ConnectionError {
-    fn could_retry(&self) -> bool {
-        match self {
-            compute::ConnectionError::Postgres(err) => err.could_retry(),
-            compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
-            _ => false,
-        }
-    }
-}
-
-pub fn retry_after(num_retries: u32) -> time::Duration {
-    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
-}
-
 /// Finish client connection initialization: confirm auth success, send params, etc.
 #[tracing::instrument(skip_all)]
 async fn prepare_client_connection(
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
new file mode 100644
index 0000000000..88b0019c49
--- /dev/null
+++ b/proxy/src/proxy/connect_compute.rs
@@ -0,0 +1,238 @@
+use crate::{
+    auth,
+    compute::{self, PostgresConnection},
+    console::{self, errors::WakeComputeError, Api},
+    metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
+    proxy::retry::{retry_after, ShouldRetry},
+};
+use async_trait::async_trait;
+use hyper::StatusCode;
+use pq_proto::StartupMessageParams;
+use std::ops::ControlFlow;
+use tokio::time;
+use tracing::{error, info, warn};
+
+const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
+
+/// If we couldn't connect, a cached connection info might be to blame
+/// (e.g. the compute node's address might've changed at the wrong time).
+/// Invalidate the cache entry (if any) to prevent subsequent errors.
+#[tracing::instrument(name = "invalidate_cache", skip_all)]
+pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
+    let is_cached = node_info.cached();
+    if is_cached {
+        warn!("invalidating stalled compute node info cache entry");
+    }
+    let label = match is_cached {
+        true => "compute_cached",
+        false => "compute_uncached",
+    };
+    NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
+
+    node_info.invalidate().config
+}
+
+/// Try to connect to the compute node once.
+#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
+async fn connect_to_compute_once(
+    node_info: &console::CachedNodeInfo,
+    timeout: time::Duration,
+    proto: &'static str,
+) -> Result<PostgresConnection, compute::ConnectionError> {
+    let allow_self_signed_compute = node_info.allow_self_signed_compute;
+
+    node_info
+        .config
+        .connect(allow_self_signed_compute, timeout, proto)
+        .await
+}
+
+#[async_trait]
+pub trait ConnectMechanism {
+    type Connection;
+    type ConnectError;
+    type Error: From<Self::ConnectError>;
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<Self::Connection, Self::ConnectError>;
+
+    fn update_connect_config(&self, conf: &mut compute::ConnCfg);
+}
+
+pub struct TcpMechanism<'a> {
+    /// KV-dictionary with PostgreSQL connection params.
+    pub params: &'a StartupMessageParams,
+    pub proto: &'static str,
+}
+
+#[async_trait]
+impl ConnectMechanism for TcpMechanism<'_> {
+    type Connection = PostgresConnection;
+    type ConnectError = compute::ConnectionError;
+    type Error = compute::ConnectionError;
+
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<PostgresConnection, Self::Error> {
+        connect_to_compute_once(node_info, timeout, self.proto).await
+    }
+
+    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
+        config.set_startup_params(self.params);
+    }
+}
+
+fn report_error(e: &WakeComputeError, retry: bool) {
+    use crate::console::errors::ApiError;
+    let retry = bool_to_str(retry);
+    let kind = match e {
+        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
+        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ref text,
+        }) if text.contains("written data quota exceeded")
+            || text.contains("the limit for current plan reached") =>
+        {
+            "quota_exceeded"
+        }
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ..
+        }) => "api_console_locked",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::BAD_REQUEST,
+            ..
+        }) => "api_console_bad_request",
+        WakeComputeError::ApiError(ApiError::Console { status, .. })
+            if status.is_server_error() =>
+        {
+            "api_console_other_server_error"
+        }
+        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
+        WakeComputeError::TimeoutError => "timeout_error",
+    };
+    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
+}
+
+/// Try to connect to the compute node, retrying if necessary.
+/// This function might update `node_info`, so we take it by `&mut`.
+#[tracing::instrument(skip_all)]
+pub async fn connect_to_compute<M: ConnectMechanism>(
+    mechanism: &M,
+    mut node_info: console::CachedNodeInfo,
+    extra: &console::ConsoleReqExtra,
+    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
+    mut latency_timer: LatencyTimer,
+) -> Result<M::Connection, M::Error>
+where
+    M::ConnectError: ShouldRetry + std::fmt::Debug,
+    M::Error: From<WakeComputeError>,
+{
+    mechanism.update_connect_config(&mut node_info.config);
+
+    // try once
+    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+        Ok(res) => {
+            latency_timer.success();
+            return Ok(res);
+        }
+        Err(e) => {
+            error!(error = ?e, "could not connect to compute node");
+            (invalidate_cache(node_info), e)
+        }
+    };
+
+    latency_timer.cache_miss();
+
+    let mut num_retries = 1;
+
+    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
+    info!("compute node's state has likely changed; requesting a wake-up");
+    let node_info = loop {
+        let wake_res = match creds {
+            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
+            #[cfg(feature = "testing")]
+            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
+            // nothing to do?
+            auth::BackendType::Link(_) => return Err(err.into()),
+            // test backend
+            #[cfg(test)]
+            auth::BackendType::Test(x) => x.wake_compute(),
+        };
+
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                report_error(&e, false);
+                return Err(e.into());
+            }
+            // failed to wake up but we can continue to retry
+            Ok(ControlFlow::Continue(e)) => {
+                report_error(&e, true);
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
+            // successfully woke up a compute node and can break the wakeup loop
+            Ok(ControlFlow::Break(mut node_info)) => {
+                node_info.config.reuse_password(&config);
+                mechanism.update_connect_config(&mut node_info.config);
+                break node_info;
+            }
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+
+        time::sleep(wait_duration).await;
+    };
+
+    // now that we have a new node, try connect to it repeatedly.
+    // this can error for a few reasons, for instance:
+    // * DNS connection settings haven't quite propagated yet
+    info!("wake_compute success. attempting to connect");
+    loop {
+        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+            Ok(res) => {
+                latency_timer.success();
+                return Ok(res);
+            }
+            Err(e) => {
+                let retriable = e.should_retry(num_retries);
+                if !retriable {
+                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
+                    return Err(e.into());
+                }
+                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
+            }
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+
+        time::sleep(wait_duration).await;
+    }
+}
+
+/// Attempts to wake up the compute node.
+/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
+/// * Returns Ok(Break(node)) if the wakeup succeeded
+/// * Returns Err(e) if there was an error
+pub fn handle_try_wake(
+    result: Result<console::CachedNodeInfo, WakeComputeError>,
+    num_retries: u32,
+) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
+    match result {
+        Err(err) => match &err {
+            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
+                Ok(ControlFlow::Continue(err))
+            }
+            _ => Err(err),
+        },
+        // Ready to try again.
+        Ok(new) => Ok(ControlFlow::Break(new)),
+    }
+}
diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs
new file mode 100644
index 0000000000..a85ed380b0
--- /dev/null
+++ b/proxy/src/proxy/retry.rs
@@ -0,0 +1,68 @@
+use crate::compute;
+use std::{error::Error, io};
+use tokio::time;
+
+/// Number of times we should retry the `/proxy_wake_compute` http request.
+/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
+pub const NUM_RETRIES_CONNECT: u32 = 16;
+const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
+const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
+
+pub trait ShouldRetry {
+    fn could_retry(&self) -> bool;
+    fn should_retry(&self, num_retries: u32) -> bool {
+        match self {
+            _ if num_retries >= NUM_RETRIES_CONNECT => false,
+            err => err.could_retry(),
+        }
+    }
+}
+
+impl ShouldRetry for io::Error {
+    fn could_retry(&self) -> bool {
+        use std::io::ErrorKind;
+        matches!(
+            self.kind(),
+            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
+        )
+    }
+}
+
+impl ShouldRetry for tokio_postgres::error::DbError {
+    fn could_retry(&self) -> bool {
+        use tokio_postgres::error::SqlState;
+        matches!(
+            self.code(),
+            &SqlState::CONNECTION_FAILURE
+                | &SqlState::CONNECTION_EXCEPTION
+                | &SqlState::CONNECTION_DOES_NOT_EXIST
+                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
+        )
+    }
+}
+
+impl ShouldRetry for tokio_postgres::Error {
+    fn could_retry(&self) -> bool {
+        if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
+            io::Error::could_retry(io_err)
+        } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
+            tokio_postgres::error::DbError::could_retry(db_err)
+        } else {
+            false
+        }
+    }
+}
+
+impl ShouldRetry for compute::ConnectionError {
+    fn could_retry(&self) -> bool {
+        match self {
+            compute::ConnectionError::Postgres(err) => err.could_retry(),
+            compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
+            _ => false,
+        }
+    }
+}
+
+pub fn retry_after(num_retries: u32) -> time::Duration {
+    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
+}
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 4691abbfb9..3c483c59ee 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -2,10 +2,13 @@
 
 mod mitm;
 
+use super::connect_compute::ConnectMechanism;
+use super::retry::ShouldRetry;
 use super::*;
 use crate::auth::backend::{ComputeUserInfo, TestBackend};
 use crate::config::CertResolver;
 use crate::console::{CachedNodeInfo, NodeInfo};
+use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
@@ -423,7 +426,7 @@ impl ConnectMechanism for TestConnectMechanism {
     async fn connect_once(
         &self,
         _node_info: &console::CachedNodeInfo,
-        _timeout: time::Duration,
+        _timeout: std::time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
         let mut counter = self.counter.lock().unwrap();
         let action = self.sequence[*counter];
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index 50b3034936..a0a84a1dc0 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -120,7 +120,7 @@ where
 struct PgFrame;
 impl Decoder for PgFrame {
     type Item = Bytes;
-    type Error = io::Error;
+    type Error = std::io::Error;
 
     fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
         if src.len() < 5 {
@@ -136,7 +136,7 @@ impl Decoder for PgFrame {
     }
 }
 impl Encoder<Bytes> for PgFrame {
-    type Error = io::Error;
+    type Error = std::io::Error;
 
     fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> {
         dst.extend_from_slice(&item);
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 8dfdfcd3db..a190b2cf8f 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -393,10 +393,10 @@ impl Limiter {
             }
             new_limit
         };
-        crate::proxy::RATE_LIMITER_LIMIT
+        crate::metrics::RATE_LIMITER_LIMIT
             .with_label_values(&["expected"])
             .set(new_limit as i64);
-        crate::proxy::RATE_LIMITER_LIMIT
+        crate::metrics::RATE_LIMITER_LIMIT
             .with_label_values(&["actual"])
             .set(actual_limit as i64);
         self.limits.store(new_limit, Ordering::Release);
@@ -470,7 +470,7 @@ impl reqwest_middleware::Middleware for Limiter {
                 )
             })?;
         info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane");
-        crate::proxy::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
+        crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
         match next.run(req, extensions).await {
             Ok(response) => {
                 self.release(token, Some(Outcome::from_reqwest_response(&response)))
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 870e9c1103..e358a0712f 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -13,8 +13,8 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
 
+use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
-use crate::proxy::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 69198d79d3..ab8903418b 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -24,13 +24,12 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
 use crate::{
     auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
     console,
-    proxy::{neon_options, LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
+    metrics::{LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
+    proxy::{connect_compute::ConnectMechanism, neon_options},
     usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};
 
-use crate::proxy::ConnectMechanism;
-
 use tracing::{error, warn, Span};
 use tracing::{info, info_span, Instrument};
 
@@ -444,7 +443,7 @@ async fn connect_to_compute(
         .await?
         .context("missing cache entry from wake_compute")?;
 
-    crate::proxy::connect_to_compute(
+    crate::proxy::connect_compute::connect_to_compute(
         &TokioMechanism {
             conn_id,
             conn_info,
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 795ba819c1..307b085ce0 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -29,7 +29,7 @@ use utils::http::error::ApiError;
 use utils::http::json::json_response;
 
 use crate::config::HttpConfig;
-use crate::proxy::NUM_CONNECTION_REQUESTS_GAUGE;
+use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;

From 33cb9a68f7dde14a851ff5893f7107b4fc53856e Mon Sep 17 00:00:00 2001
From: John Khvatov <ivaxer@yandex-team.com>
Date: Mon, 18 Dec 2023 16:33:23 +0300
Subject: [PATCH 118/202] pageserver: Reduce tracing overhead in timeline::get
 (#6115)

## Problem

The compaction process (specifically the image layer reconstruction
part) is lagging behind WAL ingest (at a speed of ~10-15MB/s) for
medium-sized tenants (30-50GB). A CPU profile shows that a significant
amount of time (see flamegraph) is being spent in
`tracing::span::Span::new`.

mainline (commit: 0ba4cae491c26c7678f7abddb68bf76134a7df90):

![reconstruct-mainline-0ba4cae491c2](https://github.com/neondatabase/neon/assets/289788/ebfd262e-5c97-4858-80c7-664a1dbcc59d)

## Summary of changes

By lowering the tracing level in get_value_reconstruct_data and
get_or_maybe_download from info to debug, we can reduce the overhead of
span creation in prod environments. On my system, this sped up the image
reconstruction process by 60% (from 14500 to 23160 page reconstructions
per sec).

pr:

![reconstruct-opt-2](https://github.com/neondatabase/neon/assets/289788/563a159b-8f2f-4300-b0a1-6cd66e7df769)


`create_image_layers()` (it is bound by a single CPU here), mainline vs. PR:

![image](https://github.com/neondatabase/neon/assets/289788/a981e3cb-6df9-4882-8a94-95e99c35aa83)
---
 pageserver/src/tenant/storage_layer/layer.rs | 160 ++++++++++---------
 test_runner/regress/test_broken_timeline.py  |   4 +-
 2 files changed, 85 insertions(+), 79 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index a4b102c314..9a8ddc1a6b 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -259,8 +259,9 @@ impl Layer {
 
         layer
             .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
-            .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
+            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
             .await
+            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     }
 
     /// Download the layer if evicted.
@@ -654,7 +655,6 @@ impl LayerInner {
     }
 
     /// Cancellation safe.
-    #[tracing::instrument(skip_all, fields(layer=%self))]
     async fn get_or_maybe_download(
         self: &Arc<Self>,
         allow_download: bool,
@@ -663,95 +663,101 @@ impl LayerInner {
         let mut init_permit = None;
 
         loop {
-            let download = move |permit| async move {
-                // disable any scheduled but not yet running eviction deletions for this
-                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+            let download = move |permit| {
+                async move {
+                    // disable any scheduled but not yet running eviction deletions for this
+                    let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
 
-                // count cancellations, which currently remain largely unexpected
-                let init_cancelled =
-                    scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+                    // count cancellations, which currently remain largely unexpected
+                    let init_cancelled =
+                        scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
 
-                // no need to make the evict_and_wait wait for the actual download to complete
-                drop(self.status.send(Status::Downloaded));
+                    // no need to make the evict_and_wait wait for the actual download to complete
+                    drop(self.status.send(Status::Downloaded));
 
-                let timeline = self
-                    .timeline
-                    .upgrade()
-                    .ok_or_else(|| DownloadError::TimelineShutdown)?;
+                    let timeline = self
+                        .timeline
+                        .upgrade()
+                        .ok_or_else(|| DownloadError::TimelineShutdown)?;
 
-                // FIXME: grab a gate
+                    // FIXME: grab a gate
 
-                let can_ever_evict = timeline.remote_client.as_ref().is_some();
+                    let can_ever_evict = timeline.remote_client.as_ref().is_some();
 
-                // check if we really need to be downloaded; could have been already downloaded by a
-                // cancelled previous attempt.
-                let needs_download = self
-                    .needs_download()
-                    .await
-                    .map_err(DownloadError::PreStatFailed)?;
+                    // check if we really need to be downloaded; could have been already downloaded by a
+                    // cancelled previous attempt.
+                    let needs_download = self
+                        .needs_download()
+                        .await
+                        .map_err(DownloadError::PreStatFailed)?;
 
-                let permit = if let Some(reason) = needs_download {
-                    if let NeedsDownload::NotFile(ft) = reason {
-                        return Err(DownloadError::NotFile(ft));
+                    let permit = if let Some(reason) = needs_download {
+                        if let NeedsDownload::NotFile(ft) = reason {
+                            return Err(DownloadError::NotFile(ft));
+                        }
+
+                        // only reset this after we've decided we really need to download. otherwise it'd
+                        // be impossible to mark cancelled downloads for eviction, like one could imagine
+                        // we would like to do for prefetching which was not needed.
+                        self.wanted_evicted.store(false, Ordering::Release);
+
+                        if !can_ever_evict {
+                            return Err(DownloadError::NoRemoteStorage);
+                        }
+
+                        if let Some(ctx) = ctx {
+                            self.check_expected_download(ctx)?;
+                        }
+
+                        if !allow_download {
+                            // this does look weird, but for LayerInner the "downloading" means also changing
+                            // internal once related state ...
+                            return Err(DownloadError::DownloadRequired);
+                        }
+
+                        tracing::info!(%reason, "downloading on-demand");
+
+                        self.spawn_download_and_wait(timeline, permit).await?
+                    } else {
+                        // the file is present locally, probably by a previous but cancelled call to
+                        // get_or_maybe_download. alternatively we might be running without remote storage.
+                        LAYER_IMPL_METRICS.inc_init_needed_no_download();
+
+                        permit
+                    };
+
+                    let since_last_eviction =
+                        self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
+                    if let Some(since_last_eviction) = since_last_eviction {
+                        // FIXME: this will not always be recorded correctly until #6028 (the no
+                        // download needed branch above)
+                        LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
                     }
 
-                    // only reset this after we've decided we really need to download. otherwise it'd
-                    // be impossible to mark cancelled downloads for eviction, like one could imagine
-                    // we would like to do for prefetching which was not needed.
-                    self.wanted_evicted.store(false, Ordering::Release);
+                    let res = Arc::new(DownloadedLayer {
+                        owner: Arc::downgrade(self),
+                        kind: tokio::sync::OnceCell::default(),
+                        version: next_version,
+                    });
 
-                    if !can_ever_evict {
-                        return Err(DownloadError::NoRemoteStorage);
+                    self.access_stats.record_residence_event(
+                        LayerResidenceStatus::Resident,
+                        LayerResidenceEventReason::ResidenceChange,
+                    );
+
+                    let waiters = self.inner.initializer_count();
+                    if waiters > 0 {
+                        tracing::info!(
+                            waiters,
+                            "completing the on-demand download for other tasks"
+                        );
                     }
 
-                    if let Some(ctx) = ctx {
-                        self.check_expected_download(ctx)?;
-                    }
+                    scopeguard::ScopeGuard::into_inner(init_cancelled);
 
-                    if !allow_download {
-                        // this does look weird, but for LayerInner the "downloading" means also changing
-                        // internal once related state ...
-                        return Err(DownloadError::DownloadRequired);
-                    }
-
-                    tracing::info!(%reason, "downloading on-demand");
-
-                    self.spawn_download_and_wait(timeline, permit).await?
-                } else {
-                    // the file is present locally, probably by a previous but cancelled call to
-                    // get_or_maybe_download. alternatively we might be running without remote storage.
-                    LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                    permit
-                };
-
-                let since_last_eviction =
-                    self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
-                if let Some(since_last_eviction) = since_last_eviction {
-                    // FIXME: this will not always be recorded correctly until #6028 (the no
-                    // download needed branch above)
-                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                    Ok((ResidentOrWantedEvicted::Resident(res), permit))
                 }
-
-                let res = Arc::new(DownloadedLayer {
-                    owner: Arc::downgrade(self),
-                    kind: tokio::sync::OnceCell::default(),
-                    version: next_version,
-                });
-
-                self.access_stats.record_residence_event(
-                    LayerResidenceStatus::Resident,
-                    LayerResidenceEventReason::ResidenceChange,
-                );
-
-                let waiters = self.inner.initializer_count();
-                if waiters > 0 {
-                    tracing::info!(waiters, "completing the on-demand download for other tasks");
-                }
-
-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-
-                Ok((ResidentOrWantedEvicted::Resident(res), permit))
+                .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
             };
 
             if let Some(init_permit) = init_permit.take() {
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 53eeb8bbe9..4da0ba7b20 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
 
     env.pageserver.allowed_errors.extend(
         [
-            ".*layer loading failed:.*",
+            ".*get_value_reconstruct_data for layer .*",
             ".*could not find data for key.*",
             ".*is not active. Current state: Broken.*",
             ".*will not become active. Current state: Broken.*",
@@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
     # (We don't check layer file contents on startup, when loading the timeline)
     #
     # This will change when we implement checksums for layers
-    with pytest.raises(Exception, match="layer loading failed:") as err:
+    with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err:
         pg2.start()
     log.info(
         f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"

From 00d90ce76a230d7afc9994df9fafd688c76ebd57 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Mon, 18 Dec 2023 16:04:47 +0100
Subject: [PATCH 119/202] Added cache for get role secret (#6165)

## Problem

Currently, if we get many consecutive connections to the same
user/ep, we send a lot of traffic to the console.

## Summary of changes

Cache the proxy_get_role_secret response with ttl=4min.

Note: this is a temporary hack; the notification listener is WIP.
---
 proxy/src/auth/backend.rs          |  9 ++-------
 proxy/src/bin/proxy.rs             | 16 +++++++++++++--
 proxy/src/config.rs                |  4 ++--
 proxy/src/console/provider.rs      | 13 +++++++++----
 proxy/src/console/provider/mock.rs |  6 +++---
 proxy/src/console/provider/neon.rs | 31 ++++++++++++++++++++++--------
 proxy/src/scram/key.rs             |  2 +-
 proxy/src/scram/secret.rs          |  1 +
 proxy/src/serverless/conn_pool.rs  |  1 -
 9 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 3b09e05bd2..0c867dfd61 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -9,7 +9,6 @@ use tokio_postgres::config::AuthKeys;
 use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::validate_password_and_exchange;
 use crate::console::errors::GetAuthInfoError;
-use crate::console::provider::AuthInfo;
 use crate::console::AuthSecret;
 use crate::proxy::connect_compute::handle_try_wake;
 use crate::proxy::retry::retry_after;
@@ -187,17 +186,13 @@ async fn auth_quirks(
     };
 
     info!("fetching user's authentication info");
-    // TODO(anna): this will slow down both "hacks" below; we probably need a cache.
-    let AuthInfo {
-        secret,
-        allowed_ips,
-    } = api.get_auth_info(extra, &info).await?;
+    let allowed_ips = api.get_allowed_ips(extra, &info).await?;
 
     // check allowed list
     if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed());
     }
-    let secret = secret.unwrap_or_else(|| {
+    let secret = api.get_role_secret(extra, &info).await?.unwrap_or_else(|| {
         // If we don't have an authentication secret, we mock one to
         // prevent malicious probing (possible due to missing protocol steps).
         // This mocked secret will never lead to successful authentication.
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index ae4c42bcb1..be3989d387 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -6,6 +6,7 @@ use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::console::provider::AllowedIpsCache;
 use proxy::console::provider::NodeInfoCache;
+use proxy::console::provider::RoleSecretCache;
 use proxy::http;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
@@ -86,7 +87,7 @@ struct ProxyCliArgs {
     #[clap(long)]
     metric_collection_interval: Option<String>,
     /// cache for `wake_compute` api method (use `size=0` to disable)
-    #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
+    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     wake_compute_cache: String,
     /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
     #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
@@ -127,8 +128,11 @@ struct ProxyCliArgs {
     #[clap(flatten)]
     aimd_config: proxy::rate_limiter::AimdConfig,
     /// cache for `allowed_ips` (use `size=0` to disable)
-    #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
+    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     allowed_ips_cache: String,
+    /// cache for `role_secret` (use `size=0` to disable)
+    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
+    role_secret_cache: String,
     /// disable ip check for http requests. If it is too time consuming, it could be turned off.
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     disable_ip_check_for_http: bool,
@@ -266,9 +270,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         AuthBackend::Console => {
             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
             let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?;
+            let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?;
 
             info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
             info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}");
+            info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}");
             let caches = Box::leak(Box::new(console::caches::ApiCaches {
                 node_info: NodeInfoCache::new(
                     "node_info_cache",
@@ -282,6 +288,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
                     allowed_ips_cache_config.ttl,
                     false,
                 ),
+                role_secret: RoleSecretCache::new(
+                    "role_secret_cache",
+                    role_secret_cache_config.size,
+                    role_secret_cache_config.ttl,
+                    false,
+                ),
             }));
 
             let config::WakeComputeLockOptions {
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index f932df4058..2ed248af8d 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -310,10 +310,10 @@ pub struct CacheOptions {
 
 impl CacheOptions {
     /// Default options for [`crate::console::provider::NodeInfoCache`].
-    pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m";
+    pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m";
 
     /// Parse cache options passed via cmdline.
-    /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`].
+    /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
     fn parse(options: &str) -> anyhow::Result<Self> {
         let mut size = None;
         let mut ttl = None;
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 8d399f26ea..7ef5e950b0 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -10,6 +10,7 @@ use crate::{
 };
 use async_trait::async_trait;
 use dashmap::DashMap;
+use smol_str::SmolStr;
 use std::{sync::Arc, time::Duration};
 use tokio::{
     sync::{OwnedSemaphorePermit, Semaphore},
@@ -216,6 +217,7 @@ impl ConsoleReqExtra {
 }
 
 /// Auth secret which is managed by the cloud.
+#[derive(Clone)]
 pub enum AuthSecret {
     #[cfg(feature = "testing")]
     /// Md5 hash of user's password.
@@ -250,18 +252,19 @@ pub struct NodeInfo {
 
 pub type NodeInfoCache = TimedLru<Arc<str>, NodeInfo>;
 pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>;
-pub type AllowedIpsCache = TimedLru<Arc<str>, Arc<Vec<String>>>;
+pub type AllowedIpsCache = TimedLru<SmolStr, Arc<Vec<String>>>;
+pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option<AuthSecret>>;
 
 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
 #[async_trait]
 pub trait Api {
     /// Get the client's auth secret for authentication.
-    async fn get_auth_info(
+    async fn get_role_secret(
         &self,
         extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
-    ) -> Result<AuthInfo, errors::GetAuthInfoError>;
+    ) -> Result<Option<AuthSecret>, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips(
         &self,
@@ -282,7 +285,9 @@ pub struct ApiCaches {
     /// Cache for the `wake_compute` API method.
     pub node_info: NodeInfoCache,
     /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead.
-    pub allowed_ips: TimedLru<Arc<str>, Arc<Vec<String>>>,
+    pub allowed_ips: AllowedIpsCache,
+    /// Cache for the `get_role_secret`. TODO(anna): use notifications listener instead.
+    pub role_secret: RoleSecretCache,
 }
 
 /// Various caches for [`console`](super).
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index c464b4daf2..9c4a7447c6 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -142,12 +142,12 @@ async fn get_execute_postgres_query(
 #[async_trait]
 impl super::Api for Api {
     #[tracing::instrument(skip_all)]
-    async fn get_auth_info(
+    async fn get_role_secret(
         &self,
         _extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
-    ) -> Result<AuthInfo, GetAuthInfoError> {
-        self.do_get_auth_info(creds).await
+    ) -> Result<Option<AuthSecret>, GetAuthInfoError> {
+        Ok(self.do_get_auth_info(creds).await?.secret)
     }
 
     async fn get_allowed_ips(
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index f748c9a41f..5bb91313c4 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -159,12 +159,24 @@ impl Api {
 #[async_trait]
 impl super::Api for Api {
     #[tracing::instrument(skip_all)]
-    async fn get_auth_info(
+    async fn get_role_secret(
         &self,
         extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
-    ) -> Result<AuthInfo, GetAuthInfoError> {
-        self.do_get_auth_info(extra, creds).await
+    ) -> Result<Option<AuthSecret>, GetAuthInfoError> {
+        let ep = creds.endpoint.clone();
+        let user = creds.inner.user.clone();
+        if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) {
+            return Ok(role_secret.clone());
+        }
+        let auth_info = self.do_get_auth_info(extra, creds).await?;
+        self.caches
+            .role_secret
+            .insert((ep.clone(), user), auth_info.secret.clone());
+        self.caches
+            .allowed_ips
+            .insert(ep, Arc::new(auth_info.allowed_ips));
+        Ok(auth_info.secret)
     }
 
     async fn get_allowed_ips(
@@ -172,8 +184,7 @@ impl super::Api for Api {
         extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
-        let key: &str = &creds.endpoint;
-        if let Some(allowed_ips) = self.caches.allowed_ips.get(key) {
+        if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) {
             ALLOWED_IPS_BY_CACHE_OUTCOME
                 .with_label_values(&["hit"])
                 .inc();
@@ -182,10 +193,14 @@ impl super::Api for Api {
         ALLOWED_IPS_BY_CACHE_OUTCOME
             .with_label_values(&["miss"])
             .inc();
-        let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips);
+        let auth_info = self.do_get_auth_info(extra, creds).await?;
+        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let ep = creds.endpoint.clone();
+        let user = creds.inner.user.clone();
         self.caches
-            .allowed_ips
-            .insert(key.into(), allowed_ips.clone());
+            .role_secret
+            .insert((ep.clone(), user), auth_info.secret);
+        self.caches.allowed_ips.insert(ep, allowed_ips.clone());
         Ok(allowed_ips)
     }
 
diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs
index e9c65fcef3..bd93fb2b70 100644
--- a/proxy/src/scram/key.rs
+++ b/proxy/src/scram/key.rs
@@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32;
 /// One of the keys derived from the [password](super::password::SaltedPassword).
 /// We use the same structure for all keys, i.e.
 /// `ClientKey`, `StoredKey`, and `ServerKey`.
-#[derive(Default, PartialEq, Eq)]
+#[derive(Clone, Default, PartialEq, Eq)]
 #[repr(transparent)]
 pub struct ScramKey {
     bytes: [u8; SCRAM_KEY_LEN],
diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs
index 424beccec9..9e74e07af1 100644
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -5,6 +5,7 @@ use super::key::ScramKey;
 
 /// Server secret is produced from [password](super::password::SaltedPassword)
 /// and is used throughout the authentication process.
+#[derive(Clone)]
 pub struct ServerSecret {
     /// Number of iterations for `PBKDF2` function.
     pub iterations: u32,
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index ab8903418b..df2d1bea32 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -431,7 +431,6 @@ async fn connect_to_compute(
         application_name: APP_NAME.to_string(),
         options: console_options,
     };
-    // TODO(anna): this is a bit hacky way, consider using console notification listener.
     if !config.disable_ip_check_for_http {
         let allowed_ips = backend.get_allowed_ips(&extra).await?;
         if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) {

From 4ea4812ab2a9909cae30562e2bf6e1dd02b79691 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 18 Dec 2023 15:47:09 +0000
Subject: [PATCH 120/202] tests: update python dependencies (#6164)

## Problem

The existing dependencies didn't work on Fedora 39 (Python 3.12).

## Summary of changes

- Update pyyaml 6.0 -> 6.0.1
- Update yarl 1.8.2 -> 1.9.4
- Update the `dnf install` line in README to include dependencies of
python packages (unrelated to upgrades, just noticed absences while
doing fresh pysync run)
---
 README.md   |   5 +-
 poetry.lock | 258 +++++++++++++++++++++++++++++-----------------------
 2 files changed, 145 insertions(+), 118 deletions(-)

diff --git a/README.md b/README.md
index 3e3123f5ee..98af1edee6 100644
--- a/README.md
+++ b/README.md
@@ -29,13 +29,14 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
+libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
   libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
+  protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \
+  libffi-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
diff --git a/poetry.lock b/poetry.lock
index 8583a71f85..76dfd6d37d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2092,51 +2092,61 @@ files = [
 
 [[package]]
 name = "pyyaml"
-version = "6.0"
+version = "6.0.1"
 description = "YAML parser and emitter for Python"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
-    {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
-    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"},
-    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"},
-    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
-    {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
-    {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
-    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
-    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
-    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
-    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
-    {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
-    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
-    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
-    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"},
-    {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"},
-    {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"},
-    {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"},
-    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"},
-    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"},
-    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"},
-    {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"},
-    {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"},
-    {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"},
-    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"},
-    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"},
-    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"},
-    {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"},
-    {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"},
-    {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"},
-    {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"},
-    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"},
-    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"},
-    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"},
-    {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"},
-    {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},
-    {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
+    {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"},
+    {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+    {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
+    {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
+    {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
+    {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
+    {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+    {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
+    {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
+    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"},
+    {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+    {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
+    {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
+    {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
+    {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
+    {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
+    {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
+    {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
 ]
 
 [[package]]
@@ -2553,85 +2563,101 @@ files = [
 
 [[package]]
 name = "yarl"
-version = "1.8.2"
+version = "1.9.4"
 description = "Yet another URL library"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"},
-    {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"},
-    {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"},
-    {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"},
-    {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"},
-    {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"},
-    {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"},
-    {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"},
-    {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"},
-    {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"},
-    {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"},
-    {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"},
-    {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"},
-    {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"},
-    {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"},
-    {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"},
-    {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"},
-    {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"},
-    {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"},
-    {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"},
-    {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"},
-    {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"},
-    {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"},
-    {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"},
+    {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"},
+    {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"},
+    {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"},
+    {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"},
+    {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"},
+    {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"},
+    {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"},
+    {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"},
+    {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"},
+    {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"},
+    {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"},
+    {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"},
+    {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"},
+    {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"},
+    {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"},
 ]
 
 [package.dependencies]

From 1f9a7d1cd0a94a7c539c4fc9ff194d4fdf2917c8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 18 Dec 2023 19:17:19 +0100
Subject: [PATCH 121/202] add a Rust client for Pageserver page_service (#6128)

Part of getpage@lsn benchmark epic:
https://github.com/neondatabase/neon/issues/5771

Stacked atop https://github.com/neondatabase/neon/pull/6145
---
 Cargo.lock                            |   9 ++
 libs/pageserver_api/src/models.rs     |  93 +++++++++++++++-
 pageserver/Cargo.toml                 |   1 +
 pageserver/client/Cargo.toml          |   8 ++
 pageserver/client/src/lib.rs          |   1 +
 pageserver/client/src/page_service.rs | 151 ++++++++++++++++++++++++++
 6 files changed, 257 insertions(+), 6 deletions(-)
 create mode 100644 pageserver/client/src/page_service.rs

diff --git a/Cargo.lock b/Cargo.lock
index f931fd6c29..9a367effbb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3145,6 +3145,7 @@ dependencies = [
  "tokio",
  "tokio-io-timeout",
  "tokio-postgres",
+ "tokio-stream",
  "tokio-tar",
  "tokio-util",
  "toml_edit",
@@ -3182,11 +3183,19 @@ dependencies = [
 name = "pageserver_client"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
  "async-trait",
+ "bytes",
+ "futures",
  "pageserver_api",
+ "postgres",
  "reqwest",
  "serde",
  "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-stream",
+ "tokio-util",
  "utils",
  "workspace_hack",
 ]
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index a78ba8ad94..0f5e202249 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -2,6 +2,7 @@ pub mod partitioning;
 
 use std::{
     collections::HashMap,
+    io::Read,
     num::{NonZeroU64, NonZeroUsize},
     time::SystemTime,
 };
@@ -19,7 +20,7 @@ use utils::{
 
 use crate::{reltag::RelTag, shard::TenantShardId};
 use anyhow::bail;
-use bytes::{BufMut, Bytes, BytesMut};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
 
 /// The state of a tenant in this pageserver.
 ///
@@ -576,6 +577,7 @@ pub enum PagestreamFeMessage {
 }
 
 // Wrapped in libpq CopyData
+#[derive(strum_macros::EnumProperty)]
 pub enum PagestreamBeMessage {
     Exists(PagestreamExistsResponse),
     Nblocks(PagestreamNblocksResponse),
@@ -584,6 +586,29 @@ pub enum PagestreamBeMessage {
     DbSize(PagestreamDbSizeResponse),
 }
 
+// Keep in sync with `pagestore_client.h`
+#[repr(u8)]
+enum PagestreamBeMessageTag {
+    Exists = 100,
+    Nblocks = 101,
+    GetPage = 102,
+    Error = 103,
+    DbSize = 104,
+}
+impl TryFrom<u8> for PagestreamBeMessageTag {
+    type Error = u8;
+    fn try_from(value: u8) -> Result<Self, u8> {
+        match value {
+            100 => Ok(PagestreamBeMessageTag::Exists),
+            101 => Ok(PagestreamBeMessageTag::Nblocks),
+            102 => Ok(PagestreamBeMessageTag::GetPage),
+            103 => Ok(PagestreamBeMessageTag::Error),
+            104 => Ok(PagestreamBeMessageTag::DbSize),
+            _ => Err(value),
+        }
+    }
+}
+
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamExistsRequest {
     pub latest: bool,
@@ -739,35 +764,91 @@ impl PagestreamBeMessage {
     pub fn serialize(&self) -> Bytes {
         let mut bytes = BytesMut::new();
 
+        use PagestreamBeMessageTag as Tag;
         match self {
             Self::Exists(resp) => {
-                bytes.put_u8(100); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::Exists as u8);
                 bytes.put_u8(resp.exists as u8);
             }
 
             Self::Nblocks(resp) => {
-                bytes.put_u8(101); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::Nblocks as u8);
                 bytes.put_u32(resp.n_blocks);
             }
 
             Self::GetPage(resp) => {
-                bytes.put_u8(102); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::GetPage as u8);
                 bytes.put(&resp.page[..]);
             }
 
             Self::Error(resp) => {
-                bytes.put_u8(103); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::Error as u8);
                 bytes.put(resp.message.as_bytes());
                 bytes.put_u8(0); // null terminator
             }
             Self::DbSize(resp) => {
-                bytes.put_u8(104); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::DbSize as u8);
                 bytes.put_i64(resp.db_size);
             }
         }
 
         bytes.into()
     }
+
+    pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
+        let mut buf = buf.reader();
+        let msg_tag = buf.read_u8()?;
+
+        use PagestreamBeMessageTag as Tag;
+        let ok =
+            match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
+                Tag::Exists => {
+                    let exists = buf.read_u8()?;
+                    Self::Exists(PagestreamExistsResponse {
+                        exists: exists != 0,
+                    })
+                }
+                Tag::Nblocks => {
+                    let n_blocks = buf.read_u32::<BigEndian>()?;
+                    Self::Nblocks(PagestreamNblocksResponse { n_blocks })
+                }
+                Tag::GetPage => {
+                    let mut page = vec![0; 8192]; // TODO: use MaybeUninit
+                    buf.read_exact(&mut page)?;
+                    PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
+                }
+                Tag::Error => {
+                    let buf = buf.get_ref();
+                    let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
+                    let rust_str = cstr.to_str()?;
+                    PagestreamBeMessage::Error(PagestreamErrorResponse {
+                        message: rust_str.to_owned(),
+                    })
+                }
+                Tag::DbSize => {
+                    let db_size = buf.read_i64::<BigEndian>()?;
+                    Self::DbSize(PagestreamDbSizeResponse { db_size })
+                }
+            };
+        let remaining = buf.into_inner();
+        if !remaining.is_empty() {
+            anyhow::bail!(
+                "remaining bytes in msg with tag={msg_tag}: {}",
+                remaining.len()
+            );
+        }
+        Ok(ok)
+    }
+
+    pub fn kind(&self) -> &'static str {
+        match self {
+            Self::Exists(_) => "Exists",
+            Self::Nblocks(_) => "Nblocks",
+            Self::GetPage(_) => "GetPage",
+            Self::Error(_) => "Error",
+            Self::DbSize(_) => "DbSize",
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 9e8172c6a1..980fbab22e 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -63,6 +63,7 @@ thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
+tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml
index 4bd36185a6..0ed27602cd 100644
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -12,3 +12,11 @@ reqwest.workspace = true
 utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+tokio-postgres.workspace = true
+tokio-stream.workspace = true
+tokio.workspace = true
+futures.workspace = true
+tokio-util.workspace = true
+anyhow.workspace = true
+postgres.workspace = true
+bytes.workspace = true
diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs
index 3963fd466c..4a3f4dea47 100644
--- a/pageserver/client/src/lib.rs
+++ b/pageserver/client/src/lib.rs
@@ -1 +1,2 @@
 pub mod mgmt_api;
+pub mod page_service;
diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs
new file mode 100644
index 0000000000..fc0d2311f7
--- /dev/null
+++ b/pageserver/client/src/page_service.rs
@@ -0,0 +1,151 @@
+use std::pin::Pin;
+
+use futures::SinkExt;
+use pageserver_api::{
+    models::{
+        PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
+        PagestreamGetPageResponse,
+    },
+    reltag::RelTag,
+};
+use tokio::task::JoinHandle;
+use tokio_postgres::CopyOutStream;
+use tokio_stream::StreamExt;
+use tokio_util::sync::CancellationToken;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+pub struct Client {
+    client: tokio_postgres::Client,
+    cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
+    conn_task: JoinHandle<()>,
+}
+
+pub struct BasebackupRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub lsn: Option<Lsn>,
+    pub gzip: bool,
+}
+
+impl Client {
+    pub async fn new(connstring: String) -> anyhow::Result<Self> {
+        let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?;
+
+        let conn_task_cancel = CancellationToken::new();
+        let conn_task = tokio::spawn({
+            let conn_task_cancel = conn_task_cancel.clone();
+            async move {
+                tokio::select! {
+                    _ = conn_task_cancel.cancelled() => { }
+                    res = connection => {
+                        res.unwrap();
+                    }
+                }
+            }
+        });
+        Ok(Self {
+            cancel_on_client_drop: Some(conn_task_cancel.drop_guard()),
+            conn_task,
+            client,
+        })
+    }
+
+    pub async fn pagestream(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<PagestreamClient> {
+        let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
+            .client
+            .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
+            .await?;
+        let Client {
+            cancel_on_client_drop,
+            conn_task,
+            client: _,
+        } = self;
+        Ok(PagestreamClient {
+            copy_both: Box::pin(copy_both),
+            conn_task,
+            cancel_on_client_drop,
+        })
+    }
+
+    pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result<CopyOutStream> {
+        let BasebackupRequest {
+            tenant_id,
+            timeline_id,
+            lsn,
+            gzip,
+        } = req;
+        let mut args = Vec::with_capacity(5);
+        args.push("basebackup".to_string());
+        args.push(format!("{tenant_id}"));
+        args.push(format!("{timeline_id}"));
+        if let Some(lsn) = lsn {
+            args.push(format!("{lsn}"));
+        }
+        if *gzip {
+            args.push("--gzip".to_string())
+        }
+        Ok(self.client.copy_out(&args.join(" ")).await?)
+    }
+}
+
+/// Create using [`Client::pagestream`].
+pub struct PagestreamClient {
+    copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
+    cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
+    conn_task: JoinHandle<()>,
+}
+
+pub struct RelTagBlockNo {
+    pub rel_tag: RelTag,
+    pub block_no: u32,
+}
+
+impl PagestreamClient {
+    pub async fn shutdown(mut self) {
+        let _ = self.cancel_on_client_drop.take();
+        self.conn_task.await.unwrap();
+    }
+
+    pub async fn getpage(
+        &mut self,
+        key: RelTagBlockNo,
+        lsn: Lsn,
+    ) -> anyhow::Result<PagestreamGetPageResponse> {
+        let req = PagestreamGetPageRequest {
+            latest: false,
+            rel: key.rel_tag,
+            blkno: key.block_no,
+            lsn,
+        };
+        let req = PagestreamFeMessage::GetPage(req);
+        let req: bytes::Bytes = req.serialize();
+        // let mut req = tokio_util::io::ReaderStream::new(&req);
+        let mut req = tokio_stream::once(Ok(req));
+
+        self.copy_both.send_all(&mut req).await?;
+
+        let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
+        let next: bytes::Bytes = next.unwrap()?;
+
+        let msg = PagestreamBeMessage::deserialize(next)?;
+        match msg {
+            PagestreamBeMessage::GetPage(p) => Ok(p),
+            PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
+            PagestreamBeMessage::Exists(_)
+            | PagestreamBeMessage::Nblocks(_)
+            | PagestreamBeMessage::DbSize(_) => {
+                anyhow::bail!(
+                    "unexpected be message kind in response to getpage request: {}",
+                    msg.kind()
+                )
+            }
+        }
+    }
+}

From 62737f37767db150383397b99a52e16217e40e4a Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Mon, 18 Dec 2023 10:05:39 -0800
Subject: [PATCH 122/202] Grant BYPASSRLS and REPLICATION explicitly to
 neon_superuser roles

---
 compute_tools/src/spec.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index ba1ee6d1b2..20299c8fde 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -298,7 +298,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                 // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
                 // from neon_superuser.
                 let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                     name.pg_quote()
                 );
                 info!("role create query: '{}'", &query);

From 82215d20b01c82eb8dd3aebef724854008cdcdb3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 18 Dec 2023 21:05:24 +0200
Subject: [PATCH 123/202] Mark some variables 'static'

Move the installation of the neon_redo_read_buffer_filter hook from
pg_init_libpagestore() to neon_init() in pagestore_smgr.c. This allows
marking it 'static', too.
---
 pgxn/neon/libpagestore.c   |  9 ++-------
 pgxn/neon/neon.h           |  7 -------
 pgxn/neon/pagestore_smgr.c | 15 +++++++++++----
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 16406ce8a3..5056a3c5ff 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -17,7 +17,6 @@
 #include "pagestore_client.h"
 #include "fmgr.h"
 #include "access/xlog.h"
-#include "access/xlogutils.h"
 #include "storage/buf_internals.h"
 #include "storage/lwlock.h"
 #include "storage/ipc.h"
@@ -62,8 +61,8 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;
 
-int			n_reconnect_attempts = 0;
-int			max_reconnect_attempts = 60;
+static int n_reconnect_attempts = 0;
+static int max_reconnect_attempts = 60;
 
 #define MAX_PAGESERVER_CONNSTRING_SIZE 256
 
@@ -83,8 +82,6 @@ static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
 static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 
-bool		(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
-
 static bool pageserver_flush(void);
 static void pageserver_disconnect(void);
 
@@ -627,8 +624,6 @@ pg_init_libpagestore(void)
 		smgr_hook = smgr_neon;
 		smgr_init_hook = smgr_init_neon;
 		dbsize_hook = neon_dbsize;
-		old_redo_read_buffer_filter = redo_read_buffer_filter;
-		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}
 
 	lfc_init();
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index 897a8373a1..c3afecc679 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -27,13 +27,6 @@ extern void pg_init_walproposer(void);
 
 extern void pg_init_extension_server(void);
 
-/*
- * Returns true if we shouldn't do REDO on that block in record indicated by
- * block_id; false otherwise.
- */
-extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
-extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
-
 extern uint64 BackpressureThrottlingTime(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
 
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 609d80588c..99e6583ab2 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -47,9 +47,10 @@
 
 #include "access/xact.h"
 #include "access/xlog.h"
+#include "access/xlogdefs.h"
 #include "access/xloginsert.h"
 #include "access/xlog_internal.h"
-#include "access/xlogdefs.h"
+#include "access/xlogutils.h"
 #include "catalog/pg_class.h"
 #include "common/hashfn.h"
 #include "executor/instrument.h"
@@ -106,6 +107,9 @@ typedef enum
 static SMgrRelation unlogged_build_rel = NULL;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 
+static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
+static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+
 /*
  * Prefetch implementation:
  *
@@ -239,7 +243,7 @@ typedef struct PrefetchState
 	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;
 
-PrefetchState *MyPState;
+static PrefetchState *MyPState;
 
 #define GetPrfSlot(ring_index) ( \
 	( \
@@ -257,7 +261,7 @@ PrefetchState *MyPState;
 	) \
 )
 
-XLogRecPtr	prefetch_lsn = 0;
+static XLogRecPtr prefetch_lsn = 0;
 
 static bool compact_prefetch_buffers(void);
 static void consume_prefetch_responses(void);
@@ -1371,6 +1375,9 @@ neon_init(void)
 	MyPState->prf_hash = prfh_create(MyPState->hashctx,
 									 readahead_buffer_size, NULL);
 
+	old_redo_read_buffer_filter = redo_read_buffer_filter;
+	redo_read_buffer_filter = neon_redo_read_buffer_filter;
+
 #ifdef DEBUG_COMPARE_LOCAL
 	mdinit();
 #endif
@@ -2869,7 +2876,7 @@ get_fsm_physical_block(BlockNumber heapblk)
  * contents, where with REDO locking it would wait on block 1 and see
  * block 3 with post-REDO contents only.
  */
-bool
+static bool
 neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 {
 	XLogRecPtr	end_recptr = record->EndRecPtr;

From c4c48cfd6344ece6ef9669db388ae17de3d0972e Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 18 Dec 2023 21:05:29 +0200
Subject: [PATCH 124/202] Clean up #includes

- No need to include c.h, port.h or pg_config.h, they are included in
  postgres.h
- No need to include postgres.h in header files. Instead, the
  assumption in PostgreSQL is that all .c files include postgres.h.
- Reorder includes alphabetically, with system headers placed before
  pgsql headers
- Remove a bunch of other unnecessary includes that got copy-pasted from
  one source file to another
---
 libs/walproposer/bindgen_deps.h     |  1 +
 pgxn/neon/control_plane_connector.c | 19 ++++++++++---------
 pgxn/neon/extension_server.c        | 15 ++-------------
 pgxn/neon/file_cache.c              | 16 +++++++---------
 pgxn/neon/libpagestore.c            | 21 +++++++++------------
 pgxn/neon/neon_utils.c              | 27 +--------------------------
 pgxn/neon/neon_utils.h              |  2 --
 pgxn/neon/pagestore_client.h        |  7 ++-----
 pgxn/neon/pagestore_smgr.c          | 10 +++++-----
 pgxn/neon/walproposer.h             |  8 +++-----
 pgxn/neon/walproposer_compat.c      |  6 ++++--
 11 files changed, 44 insertions(+), 88 deletions(-)

diff --git a/libs/walproposer/bindgen_deps.h b/libs/walproposer/bindgen_deps.h
index b95788347c..41ee1cd4a3 100644
--- a/libs/walproposer/bindgen_deps.h
+++ b/libs/walproposer/bindgen_deps.h
@@ -1 +1,2 @@
+#include "postgres.h"
 #include "walproposer.h"
diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index 2e7da671f9..e467a9c43a 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -19,20 +19,21 @@
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
+
+#include <curl/curl.h>
+
+#include "access/xact.h"
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "libpq/crypt.h"
+#include "miscadmin.h"
 #include "tcop/pquery.h"
 #include "tcop/utility.h"
-#include "access/xact.h"
+#include "utils/acl.h"
+#include "utils/guc.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
-#include "commands/defrem.h"
-#include "miscadmin.h"
-#include "utils/acl.h"
-#include "fmgr.h"
-#include "utils/guc.h"
-#include "port.h"
-#include <curl/curl.h>
 #include "utils/jsonb.h"
-#include "libpq/crypt.h"
 
 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
 
diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c
index fbbb8fd448..d9a75142f1 100644
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -1,4 +1,3 @@
-
 /*-------------------------------------------------------------------------
  *
  * extension_server.c
@@ -10,21 +9,11 @@
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
-#include "tcop/pquery.h"
-#include "tcop/utility.h"
-#include "access/xact.h"
-#include "utils/hsearch.h"
-#include "utils/memutils.h"
-#include "commands/defrem.h"
-#include "miscadmin.h"
-#include "utils/acl.h"
-#include "fmgr.h"
-#include "utils/guc.h"
-#include "port.h"
-#include "fmgr.h"
 
 #include <curl/curl.h>
 
+#include "utils/guc.h"
+
 static int	extension_server_port = 0;
 
 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 54b3661e66..53258f4e49 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -13,32 +13,30 @@
  *-------------------------------------------------------------------------
  */
 
+#include "postgres.h"
+
 #include <sys/file.h>
 #include <unistd.h>
 #include <fcntl.h>
 
-#include "postgres.h"
-
 #include "neon_pgversioncompat.h"
 
+#include "access/parallel.h"
 #include "funcapi.h"
 #include "miscadmin.h"
-#include "pgstat.h"
 #include "pagestore_client.h"
-#include "access/parallel.h"
+#include "pgstat.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
 #include "storage/buf_internals.h"
-#include "storage/latch.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/latch.h"
 #include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
 #include "utils/builtins.h"
 #include "utils/dynahash.h"
 #include "utils/guc.h"
-#include "storage/fd.h"
-#include "storage/pg_shmem.h"
-#include "storage/buf_internals.h"
-#include "pgstat.h"
 
 /*
  * Local file cache is used to temporary store relations pages in local file system.
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 5056a3c5ff..3b038f906f 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -14,27 +14,24 @@
  */
 #include "postgres.h"
 
-#include "pagestore_client.h"
-#include "fmgr.h"
 #include "access/xlog.h"
-#include "storage/buf_internals.h"
-#include "storage/lwlock.h"
-#include "storage/ipc.h"
-#include "storage/pg_shmem.h"
-#include "c.h"
-#include "postmaster/interrupt.h"
-
+#include "fmgr.h"
 #include "libpq-fe.h"
-#include "libpq/pqformat.h"
 #include "libpq/libpq.h"
-
+#include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "postmaster/interrupt.h"
+#include "storage/buf_internals.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
 #include "utils/guc.h"
 
 #include "neon.h"
-#include "walproposer.h"
 #include "neon_utils.h"
+#include "pagestore_client.h"
+#include "walproposer.h"
 
 #define PageStoreTrace DEBUG5
 
diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c
index 807d2decf6..9135847aaf 100644
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -3,33 +3,8 @@
 
 #include "postgres.h"
 
-#include "access/timeline.h"
-#include "access/xlogutils.h"
-#include "common/logging.h"
-#include "common/ip.h"
-#include "funcapi.h"
-#include "libpq/libpq.h"
+#include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
-#include "miscadmin.h"
-#include "postmaster/interrupt.h"
-#include "replication/slot.h"
-#include "replication/walsender_private.h"
-
-#include "storage/ipc.h"
-#include "utils/builtins.h"
-#include "utils/ps_status.h"
-
-#include "libpq-fe.h"
-#include <netinet/tcp.h>
-#include <unistd.h>
-
-#if PG_VERSION_NUM >= 150000
-#include "access/xlogutils.h"
-#include "access/xlogrecovery.h"
-#endif
-#if PG_MAJORVERSION_NUM >= 16
-#include "utils/guc.h"
-#endif
 
 /*
  * Convert a character which represents a hexadecimal digit to an integer.
diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h
index 20745d8b26..a86f1e061c 100644
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -1,8 +1,6 @@
 #ifndef __NEON_UTILS_H__
 #define __NEON_UTILS_H__
 
-#include "postgres.h"
-
 bool		HexDecodeString(uint8 *result, char *input, int nbytes);
 uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index ecfadb01d6..225959ef64 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -13,19 +13,16 @@
 #ifndef pageserver_h
 #define pageserver_h
 
-#include "postgres.h"
 #include "neon_pgversioncompat.h"
 
 #include "access/xlogdefs.h"
 #include RELFILEINFO_HDR
-#include "storage/block.h"
-#include "storage/smgr.h"
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
+#include "storage/block.h"
+#include "storage/smgr.h"
 #include "utils/memutils.h"
 
-#include "pg_config.h"
-
 typedef enum
 {
 	/* pagestore_client -> pagestore */
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 99e6583ab2..8888cd89c6 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -54,19 +54,19 @@
 #include "catalog/pg_class.h"
 #include "common/hashfn.h"
 #include "executor/instrument.h"
-#include "pagestore_client.h"
-#include "postmaster/interrupt.h"
+#include "pgstat.h"
 #include "postmaster/autovacuum.h"
+#include "postmaster/interrupt.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
 #include "storage/fsm_internals.h"
-#include "storage/smgr.h"
 #include "storage/md.h"
-#include "pgstat.h"
+#include "storage/smgr.h"
+
+#include "pagestore_client.h"
 
 #if PG_VERSION_NUM >= 150000
-#include "access/xlogutils.h"
 #include "access/xlogrecovery.h"
 #endif
 
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 615018c58e..6ba2aae75b 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -1,14 +1,12 @@
 #ifndef __NEON_WALPROPOSER_H__
 #define __NEON_WALPROPOSER_H__
 
-#include "postgres.h"
-#include "access/xlogdefs.h"
-#include "port.h"
-#include "access/xlog_internal.h"
 #include "access/transam.h"
+#include "access/xlogdefs.h"
+#include "access/xlog_internal.h"
 #include "nodes/replnodes.h"
-#include "utils/uuid.h"
 #include "replication/walreceiver.h"
+#include "utils/uuid.h"
 
 #define SK_MAGIC 0xCafeCeefu
 #define SK_PROTOCOL_VERSION 2
diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c
index 04b519ab15..35d984c52e 100644
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -3,11 +3,13 @@
  * This is needed to avoid linking to full postgres server installation. This file
  * is compiled as a part of libwalproposer static library.
  */
+#include "postgres.h"
 
 #include <stdio.h>
-#include "walproposer.h"
-#include "utils/datetime.h"
+
 #include "miscadmin.h"
+#include "utils/datetime.h"
+#include "walproposer.h"
 
 void
 ExceptionalCondition(const char *conditionName,

From 6939fc3db6d18569ae2d11fca4abdef1689841d4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 18 Dec 2023 21:05:31 +0200
Subject: [PATCH 125/202] Remove declarations of non-existent global variables
 and functions

FileCacheMonitorMain was removed in commit b497d0094e.
---
 pgxn/neon/file_cache.c       | 2 --
 pgxn/neon/pagestore_client.h | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 53258f4e49..6725ce8fff 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -100,8 +100,6 @@ static shmem_request_hook_type prev_shmem_request_hook;
 
 #define LFC_ENABLED() (lfc_ctl->limit != 0)
 
-void		PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
-
 /*
  * Local file cache is optional and Neon can work without it.
  * In case of any any errors with this cache, we should disable it but to not throw error.
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 225959ef64..3fcaab0bee 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -155,11 +155,8 @@ extern page_server_api *page_server;
 extern char *page_server_connstring;
 extern int	flush_every_n_requests;
 extern int	readahead_buffer_size;
-extern bool seqscan_prefetch_enabled;
-extern int	seqscan_prefetch_distance;
 extern char *neon_timeline;
 extern char *neon_tenant;
-extern bool wal_redo;
 extern int32 max_cluster_size;
 
 extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);

From 6e6e40dd7fc81f45166a008e9f8f957f1345a420 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Mon, 18 Dec 2023 23:24:22 +0100
Subject: [PATCH 126/202] Invalidate credentials on auth failure (#6171)

## Problem

If a user resets their password, the cache learns about the change only
after `ttl` minutes.

## Summary of changes

Invalidate the cached role secret on auth failure.
---
 proxy/src/auth.rs                  |  4 ++++
 proxy/src/auth/backend.rs          | 34 +++++++++++++++++++++++++++++-
 proxy/src/console/provider.rs      |  3 ++-
 proxy/src/console/provider/mock.rs |  7 ++++--
 proxy/src/console/provider/neon.rs | 12 ++++++-----
 5 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index eadb9abd43..64ef108e11 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -87,6 +87,10 @@ impl AuthError {
     pub fn too_many_connections() -> Self {
         AuthErrorImpl::TooManyConnections.into()
     }
+
+    pub fn is_auth_failed(&self) -> bool {
+        matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
+    }
 }
 
 impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 0c867dfd61..923bd02560 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -192,14 +192,46 @@ async fn auth_quirks(
     if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed());
     }
-    let secret = api.get_role_secret(extra, &info).await?.unwrap_or_else(|| {
+    let cached_secret = api.get_role_secret(extra, &info).await?;
+
+    let secret = cached_secret.clone().unwrap_or_else(|| {
         // If we don't have an authentication secret, we mock one to
         // prevent malicious probing (possible due to missing protocol steps).
         // This mocked secret will never lead to successful authentication.
         info!("authentication info not found, mocking it");
         AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
     });
+    match authenticate_with_secret(
+        secret,
+        info,
+        client,
+        unauthenticated_password,
+        allow_cleartext,
+        config,
+        latency_timer,
+    )
+    .await
+    {
+        Ok(keys) => Ok(keys),
+        Err(e) => {
+            if e.is_auth_failed() {
+                // The password could have been changed, so we invalidate the cache.
+                cached_secret.invalidate();
+            }
+            Err(e)
+        }
+    }
+}
 
+async fn authenticate_with_secret(
+    secret: AuthSecret,
+    info: ComputeUserInfo,
+    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
+    unauthenticated_password: Option<Vec<u8>>,
+    allow_cleartext: bool,
+    config: &'static AuthenticationConfig,
+    latency_timer: &mut LatencyTimer,
+) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     if let Some(password) = unauthenticated_password {
         let auth_outcome = validate_password_and_exchange(&password, secret)?;
         let keys = match auth_outcome {
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 7ef5e950b0..e4cf1e8c8e 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -254,6 +254,7 @@ pub type NodeInfoCache = TimedLru<Arc<str>, NodeInfo>;
 pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>;
 pub type AllowedIpsCache = TimedLru<SmolStr, Arc<Vec<String>>>;
 pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option<AuthSecret>>;
+pub type CachedRoleSecret = timed_lru::Cached<&'static RoleSecretCache>;
 
 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
@@ -264,7 +265,7 @@ pub trait Api {
         &self,
         extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
-    ) -> Result<Option<AuthSecret>, errors::GetAuthInfoError>;
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips(
         &self,
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 9c4a7447c6..dba5e5863f 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -6,6 +6,7 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
+use crate::console::provider::CachedRoleSecret;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use async_trait::async_trait;
 use futures::TryFutureExt;
@@ -146,8 +147,10 @@ impl super::Api for Api {
         &self,
         _extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
-    ) -> Result<Option<AuthSecret>, GetAuthInfoError> {
-        Ok(self.do_get_auth_info(creds).await?.secret)
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        Ok(CachedRoleSecret::new_uncached(
+            self.do_get_auth_info(creds).await?.secret,
+        ))
     }
 
     async fn get_allowed_ips(
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 5bb91313c4..628d98df49 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -3,7 +3,8 @@
 use super::{
     super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
-    ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
+    ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra,
+    NodeInfo,
 };
 use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
 use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
@@ -163,20 +164,21 @@ impl super::Api for Api {
         &self,
         extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
-    ) -> Result<Option<AuthSecret>, GetAuthInfoError> {
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         let ep = creds.endpoint.clone();
         let user = creds.inner.user.clone();
         if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) {
-            return Ok(role_secret.clone());
+            return Ok(role_secret);
         }
         let auth_info = self.do_get_auth_info(extra, creds).await?;
-        self.caches
+        let (_, secret) = self
+            .caches
             .role_secret
             .insert((ep.clone(), user), auth_info.secret.clone());
         self.caches
             .allowed_ips
             .insert(ep, Arc::new(auth_info.allowed_ips));
-        Ok(auth_info.secret)
+        Ok(secret)
     }
 
     async fn get_allowed_ips(

From c272c68e5c0715a55441c1db8235ea57662a8cc0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 19 Dec 2023 11:20:56 +0100
Subject: [PATCH 127/202] RFC: Per-Tenant GetPage@LSN Throttling (#5648)

Implementation epic: https://github.com/neondatabase/neon/issues/5899
---
 docs/rfcs/029-getpage-throttling.md | 197 ++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 docs/rfcs/029-getpage-throttling.md

diff --git a/docs/rfcs/029-getpage-throttling.md b/docs/rfcs/029-getpage-throttling.md
new file mode 100644
index 0000000000..b4f9adefc5
--- /dev/null
+++ b/docs/rfcs/029-getpage-throttling.md
@@ -0,0 +1,197 @@
+# Per-Tenant GetPage@LSN Throttling
+
+Author: Christian Schwarz
+Date: Oct 24, 2023
+
+## Summary
+
+This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver
+and the interactions with its client, i.e., the neon_smgr component in Compute.
+
+The result of implementing & executing this RFC will be a fleet-wide upper limit for
+**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**.
+
+## Background
+
+### GetPage@LSN Request Flow
+
+Pageserver exposes its `page_service.rs` as a libpq listener.
+The Computes' `neon_smgr` module connects to that libpq listener.
+Once a connection is established, the protocol allows Compute to request page images at a given LSN.
+We call these requests GetPage@LSN requests, or GetPage requests for short.
+Other request types can be sent, but these are low traffic compared to GetPage requests
+and are not the concern of this RFC.
+
+Pageserver associates one libpq connection with one tokio task.
+
+Per connection/task, the pq protocol is handled by the common `postgres_backend` crate.
+Its `run_message_loop` function invokes the `page_service` specific `impl<IO> postgres_backend::Handler<IO> for PageServerHandler`.
+Requests are processed in the order in which they arrive via the TCP-based pq protocol.
+So, there is no concurrent request processing within one connection/task.
+
+There is a degree of natural pipelining:
+Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream.
+And Pageserver can fill the pipe with responses in the other direction.
+Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc.
+
+### GetPage@LSN Access Pattern
+
+The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC).
+Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches.
+
+If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*.
+
+## Motivation
+
+In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h,
+then dropping to ca 18k GetPage/second for a period of 9h.
+
+We noticed this because of an internal GetPage latency SLO burn rate alert, i.e.,
+the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO.
+
+Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same Pageserver.
+
+However, here are some illustrative data points for the 155k period:
+The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance.
+We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is 1.12GiB/s = 9.4Gb/s.`)
+The CPU utilization of the instance was 75% user+system.
+Pageserver page cache served 1.75M accesses/second at a hit rate of ca 90%.
+The hit rate for materialized pages was ca. 40%.
+Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100.
+
+The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**.
+The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM.
+The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**.
+
+My claim is that it was **unhealthy to serve this workload at the pace we did**:
+* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this)
+* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons:
+    * **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops.
+      At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit.
+      The result is an **uneven** performance profile from the Compute perspective.
+
+    * **economics**: Neon currently does not charge for IOPS, only capacity.
+      **We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.**
+      For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume.
+      Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume.
+      We charge 0$.
+      It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free.
+
+
+Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits:
+vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver.
+So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity.
+
+## Solution: Throttling GetPage Requests
+
+**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**.
+That is, unless we want to start charging for provisioned GetPage@LSN/second.
+Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size.
+Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913).
+
+## The Design Space
+
+What remains is the question of *policy* and *mechanism*:
+
+**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant.
+Candidates are:
+
+* hard limit, same limit value per connection|timeline|tenant
+    * Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance.
+      This is a major operational pain point / risk right now.
+* hard limit, configurable per connection|timeline|tenant
+    * This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers.
+    * Note that this is not a mechanism to guarantee a minimum provisioned rate, i.e., this is not a mechanism to guarantee a certain QoS for a tenant.
+* fair share among active connections|timelines|tenants per instance
+    * example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity
+    * NB: needs definition of "active", and knowledge of available GetPage/second capacity in advance
+* ...
+
+
+Regarding **mechanism**, it's clear that **backpressure** is the way to go.
+However, we must choose between
+* **implicit** backpressure through pq/TCP and
+* **explicit** rejection of requests + retries with exponential backoff
+
+Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**:
+where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling?
+
+And when we eventually move the measurement point into the Computes (to avoid coordinated omission),
+how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO?
+
+## Scope Of This RFC
+
+**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**.
+
+This proposal is easy to implement and significantly de-risks operating large Pageservers,
+based on the assumption that extremely-high-GetPage-rate-episodes like the one from the "Motivation" section are uncorrelated between tenants.
+
+For example, suppose we pick a limit that allows up to 10 tenants to go at limit rate.
+Suppose our Pageserver can serve 100k GetPage/second total at a 100% page cache miss rate.
+If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation.
+
+The mechanism for backpressure will be TCP-based implicit backpressure.
+The compute team isn't concerned about prefetch queue depth.
+Pageserver will implement it by delaying the reading of requests from the libpq connection(s).
+
+The rate limit will be implemented using a per-tenant token bucket.
+The bucket will be shared among all connections to the tenant.
+The bucket implementation supports starvation-preventing `await`ing.
+The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/).
+The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771
+can be used to evaluate the overhead of sharing the bucket among connections of a tenant.
+A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler.
+
+Regarding metrics / the internal GetPage latency SLO:
+we will measure the GetPage latency SLO _after_ the throttler and introduce a new metric to measure the amount of throttling, quantified by:
+- histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver)
+- histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver)
+
+Further observability measures:
+- an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in that last minute.
+  The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats.
+
+Rollout will happen as follows:
+- deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf
+- experimentation in staging and later production to study impact & interaction with auto-scaling
+- determination of a sensible global default value
+  - the value will be chosen as high as possible ...
+  - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance.
+- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default
+- reset of the experimental per-tenant overrides
+- gain experience & lower the limit over time
+  - we stop lowering the limit as soon as this RFC's goal is achieved, i.e.,
+    once we decide that in practice the chosen value sufficiently de-risks operating large pageservers
+
+The per-tenant override will remain for emergencies and testing.
+But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant.
+
+Toward the upper layers of the Neon stack, the resulting limit will be
+**"the highest GetPage/second that Pageserver can support for a single tenant"**.
+
+### Rationale
+
+We decided against error + retry because of worries about starvation.
+
+## Future Work
+
+Enable per-tenant emergency override of the limit via Console.
+Should be part of a more general framework to specify tenant config overrides.
+**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users,
+or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that
+concerns itself with GetPage/second capacity planning.
+
+Compute-side metrics for GetPage latency.
+
+Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled.
+
+Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss.
+
+Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant.
+Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance.
+With per-tenant rate limiting, we will not meet that expectation.
+However, we can currently only scale per tenant.
+Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis.
+But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit.
+To solve this properly, I think we'll need replicas for tenants / shards.
+To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas.

From a89d6dc76e8406ad15e45b190bc687b8b208c3e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 19 Dec 2023 11:29:16 +0100
Subject: [PATCH 128/202] Always send a json response for
 timeline_get_lsn_by_timestamp (#6178)

As part of the transition laid out in
[this](https://github.com/neondatabase/cloud/pull/7553#discussion_r1370473911)
comment, don't read the `version` query parameter in
`timeline_get_lsn_by_timestamp`, but always return the structured json
response.

Follow-up of https://github.com/neondatabase/neon/pull/5608
---
 pageserver/src/http/routes.rs           | 37 ++++---------
 test_runner/fixtures/pageserver/http.py | 12 ++++-
 test_runner/regress/test_lsn_mapping.py | 71 ++-----------------------
 3 files changed, 24 insertions(+), 96 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 601fad5bde..bc8b677f77 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -592,8 +592,6 @@ async fn get_lsn_by_timestamp_handler(
         )));
     }
 
-    let version: Option<u8> = parse_query_param(&request, "version")?;
-
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let timestamp_raw = must_get_query_param(&request, "timestamp")?;
     let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -606,31 +604,18 @@ async fn get_lsn_by_timestamp_handler(
     let result = timeline
         .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
         .await?;
-
-    if version.unwrap_or(0) > 1 {
-        #[derive(serde::Serialize)]
-        struct Result {
-            lsn: Lsn,
-            kind: &'static str,
-        }
-        let (lsn, kind) = match result {
-            LsnForTimestamp::Present(lsn) => (lsn, "present"),
-            LsnForTimestamp::Future(lsn) => (lsn, "future"),
-            LsnForTimestamp::Past(lsn) => (lsn, "past"),
-            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
-        };
-        json_response(StatusCode::OK, Result { lsn, kind })
-    } else {
-        // FIXME: this is a temporary crutch not to break backwards compatibility
-        // See https://github.com/neondatabase/neon/pull/5608
-        let result = match result {
-            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
-            LsnForTimestamp::Future(_lsn) => "future".into(),
-            LsnForTimestamp::Past(_lsn) => "past".into(),
-            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
-        };
-        json_response(StatusCode::OK, result)
+    #[derive(serde::Serialize)]
+    struct Result {
+        lsn: Lsn,
+        kind: &'static str,
     }
+    let (lsn, kind) = match result {
+        LsnForTimestamp::Present(lsn) => (lsn, "present"),
+        LsnForTimestamp::Future(lsn) => (lsn, "future"),
+        LsnForTimestamp::Past(lsn) => (lsn, "past"),
+        LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
+    };
+    json_response(StatusCode::OK, Result { lsn, kind })
 }
 
 async fn get_timestamp_of_lsn_handler(
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index eda8813c36..add6c4288a 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -510,13 +510,21 @@ class PageserverHttpClient(requests.Session):
         assert res_json is None
 
     def timeline_get_lsn_by_timestamp(
-        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        timestamp,
+        version: Optional[int] = None,
     ):
         log.info(
             f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
         )
+        if version is None:
+            version_str = ""
+        else:
+            version_str = f"&version={version}"
         res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}",
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}",
         )
         self.verbose_error(res)
         res_json = res.json()
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index f79c1c347c..65d6d7a9fd 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -8,71 +8,6 @@ from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
-#
-# Test pageserver get_lsn_by_timestamp API
-#
-def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
-
-    new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
-    endpoint_main = env.endpoints.create_start("test_lsn_mapping")
-    log.info("postgres is running on 'test_lsn_mapping' branch")
-
-    cur = endpoint_main.connect().cursor()
-    # Create table, and insert rows, each in a separate transaction
-    # Disable synchronous_commit to make this initialization go faster.
-    #
-    # Each row contains current insert LSN and the current timestamp, when
-    # the row was inserted.
-    cur.execute("SET synchronous_commit=off")
-    cur.execute("CREATE TABLE foo (x integer)")
-    tbl = []
-    for i in range(1000):
-        cur.execute("INSERT INTO foo VALUES(%s)", (i,))
-        # Get the timestamp at UTC
-        after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
-        tbl.append([i, after_timestamp])
-
-    # Execute one more transaction with synchronous_commit enabled, to flush
-    # all the previous transactions
-    cur.execute("SET synchronous_commit=on")
-    cur.execute("INSERT INTO foo VALUES (-1)")
-
-    # Wait until WAL is received by pageserver
-    wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
-
-    with env.pageserver.http_client() as client:
-        # Check edge cases: timestamp in the future
-        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
-        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
-        )
-        assert result == "future"
-
-        # timestamp too the far history
-        probe_timestamp = tbl[0][1] - timedelta(hours=10)
-        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
-        )
-        assert result == "past"
-
-        # Probe a bunch of timestamps in the valid range
-        for i in range(1, len(tbl), 100):
-            probe_timestamp = tbl[i][1]
-            lsn = client.timeline_get_lsn_by_timestamp(
-                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
-            )
-            # Call get_lsn_by_timestamp to get the LSN
-            # Launch a new read-only node at that LSN, and check that only the rows
-            # that were supposed to be committed at that point in time are visible.
-            endpoint_here = env.endpoints.create_start(
-                branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
-            )
-            assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
-
-            endpoint_here.stop_and_destroy()
-
-
 #
 # Test pageserver get_lsn_by_timestamp API
 #
@@ -130,7 +65,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
         # Timestamp is in the future
         probe_timestamp = tbl[-1][1] + timedelta(hours=1)
         result = client.timeline_get_lsn_by_timestamp(
-            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z"
         )
         assert result["kind"] == "future"
         # make sure that we return a well advanced lsn here
@@ -139,7 +74,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
         # Timestamp is in the unreachable past
         probe_timestamp = tbl[0][1] - timedelta(hours=10)
         result = client.timeline_get_lsn_by_timestamp(
-            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z"
         )
         assert result["kind"] == "past"
         # make sure that we return the minimum lsn here at the start of the range
@@ -149,7 +84,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
         for i in range(1, len(tbl), 100):
             probe_timestamp = tbl[i][1]
             result = client.timeline_get_lsn_by_timestamp(
-                tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+                tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z"
             )
             assert result["kind"] not in ["past", "nodata"]
             lsn = result["lsn"]

From fbb979d5e34d1a2aed6578faba72d3b6cad60366 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 19 Dec 2023 11:29:50 +0100
Subject: [PATCH 129/202] remote_storage: move shared utilities for S3 and
 Azure into common module (#6176)

The PR does two things:

* move the util functions present in the remote_storage Azure and S3
test files into a shared one, deduplicating them.
* add a `s3_upload_download_works` test as a copy of the Azure test

The goal is mainly to fight duplication and make the code a little bit
more generic (like removing mentions of s3 and azure from function
names).

This is a first step towards #6146.
---
 libs/remote_storage/tests/common/mod.rs      | 200 +++++++++++++++
 libs/remote_storage/tests/test_real_azure.rs | 219 ++--------------
 libs/remote_storage/tests/test_real_s3.rs    | 253 +++++--------------
 3 files changed, 288 insertions(+), 384 deletions(-)
 create mode 100644 libs/remote_storage/tests/common/mod.rs

diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs
new file mode 100644
index 0000000000..bca117ed1a
--- /dev/null
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -0,0 +1,200 @@
+use std::collections::HashSet;
+use std::ops::ControlFlow;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use anyhow::Context;
+use bytes::Bytes;
+use camino::Utf8Path;
+use futures::stream::Stream;
+use once_cell::sync::OnceCell;
+use remote_storage::{Download, GenericRemoteStorage, RemotePath};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info};
+
+static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+
+pub(crate) fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+pub(crate) fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
+
+pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result<Vec<u8>> {
+    let mut buf = Vec::new();
+    tokio::io::copy_buf(
+        &mut tokio_util::io::StreamReader::new(dl.download_stream),
+        &mut buf,
+    )
+    .await?;
+    Ok(buf)
+}
+
+// Uploads files `folder{j}/blob_{i}.txt` (with `j = i / 7`). See test description for more details.
+pub(crate) async fn upload_simple_remote_data(
+    client: &Arc<GenericRemoteStorage>,
+    upload_tasks_count: usize,
+) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
+    info!("Creating {upload_tasks_count} remote files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..upload_tasks_count + 1 {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
+            let blob_path = RemotePath::new(
+                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
+            )
+            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
+
+            Ok::<_, anyhow::Error>(blob_path)
+        });
+    }
+
+    let mut upload_tasks_failed = false;
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok(upload_path) => {
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true;
+            }
+        }
+    }
+
+    if upload_tasks_failed {
+        ControlFlow::Break(uploaded_blobs)
+    } else {
+        ControlFlow::Continue(uploaded_blobs)
+    }
+}
+
+pub(crate) async fn cleanup(
+    client: &Arc<GenericRemoteStorage>,
+    objects_to_delete: HashSet<RemotePath>,
+) {
+    info!(
+        "Removing {} objects from the remote storage during cleanup",
+        objects_to_delete.len()
+    );
+    let mut delete_tasks = JoinSet::new();
+    for object_to_delete in objects_to_delete {
+        let task_client = Arc::clone(client);
+        delete_tasks.spawn(async move {
+            debug!("Deleting remote item at path {object_to_delete:?}");
+            task_client
+                .delete(&object_to_delete)
+                .await
+                .with_context(|| format!("{object_to_delete:?} removal"))
+        });
+    }
+
+    while let Some(task_run_result) = delete_tasks.join_next().await {
+        match task_run_result {
+            Ok(task_result) => match task_result {
+                Ok(()) => {}
+                Err(e) => error!("Delete task failed: {e:?}"),
+            },
+            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
+        }
+    }
+}
+pub(crate) struct Uploads {
+    pub(crate) prefixes: HashSet<RemotePath>,
+    pub(crate) blobs: HashSet<RemotePath>,
+}
+
+pub(crate) async fn upload_remote_data(
+    client: &Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    upload_tasks_count: usize,
+) -> ControlFlow<Uploads, Uploads> {
+    info!("Creating {upload_tasks_count} remote files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..upload_tasks_count + 1 {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
+                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
+            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;
+
+            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
+        });
+    }
+
+    let mut upload_tasks_failed = false;
+    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok((upload_prefix, upload_path)) => {
+                uploaded_prefixes.insert(upload_prefix);
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true;
+            }
+        }
+    }
+
+    let uploads = Uploads {
+        prefixes: uploaded_prefixes,
+        blobs: uploaded_blobs,
+    };
+    if upload_tasks_failed {
+        ControlFlow::Break(uploads)
+    } else {
+        ControlFlow::Continue(uploads)
+    }
+}
+
+pub(crate) fn ensure_logging_ready() {
+    LOGGING_DONE.get_or_init(|| {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
+        )
+        .expect("logging init failed");
+    });
+}
diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs
index 7327803198..0387dc30e7 100644
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -2,23 +2,23 @@ use std::collections::HashSet;
 use std::env;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
-use bytes::Bytes;
 use camino::Utf8Path;
-use futures::stream::Stream;
-use once_cell::sync::OnceCell;
 use remote_storage::{
-    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
 };
 use test_context::{test_context, AsyncTestContext};
-use tokio::task::JoinSet;
-use tracing::{debug, error, info};
+use tracing::{debug, info};
 
-static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+mod common;
+
+use common::{
+    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
+    upload_stream, wrap_stream,
+};
 
 const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
 
@@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test";
 /// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
 /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
 ///
-/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
+/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
 /// where
 /// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
 /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
@@ -97,7 +97,7 @@ async fn azure_pagination_should_work(
 /// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
 /// See `Azure_pagination_should_work` for more information.
 ///
-/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
+/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_simple_remote_data`]
 /// Then performs the following queries:
 ///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
 ///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
@@ -218,18 +218,9 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
 
     ctx.client.upload(data, len, &path, None).await?;
 
-    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
-        let mut buf = Vec::new();
-        tokio::io::copy_buf(
-            &mut tokio_util::io::StreamReader::new(dl.download_stream),
-            &mut buf,
-        )
-        .await?;
-        Ok(buf)
-    }
     // Normal download request
     let dl = ctx.client.download(&path).await?;
-    let buf = download_and_compare(dl).await?;
+    let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig);
 
     // Full range (end specified)
@@ -237,12 +228,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
         .client
         .download_byte_range(&path, 0, Some(len as u64))
         .await?;
-    let buf = download_and_compare(dl).await?;
+    let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig);
 
     // partial range (end specified)
     let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_and_compare(dl).await?;
+    let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig[4..10]);
 
     // partial range (end beyond real end)
@@ -250,17 +241,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
         .client
         .download_byte_range(&path, 8, Some(len as u64 * 100))
         .await?;
-    let buf = download_and_compare(dl).await?;
+    let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig[8..]);
 
     // Partial range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_and_compare(dl).await?;
+    let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig[4..]);
 
     // Full range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_and_compare(dl).await?;
+    let buf = download_to_vec(dl).await?;
     assert_eq!(&buf, &orig);
 
     debug!("Cleanup: deleting file at path {path:?}");
@@ -272,17 +263,6 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
     Ok(())
 }
 
-fn ensure_logging_ready() {
-    LOGGING_DONE.get_or_init(|| {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-            utils::logging::Output::Stdout,
-        )
-        .expect("logging init failed");
-    });
-}
-
 struct EnabledAzure {
     client: Arc<GenericRemoteStorage>,
     base_prefix: &'static str,
@@ -352,7 +332,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
 
         let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
 
-        match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
+        match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
             ControlFlow::Continue(uploads) => {
                 info!("Remote objects created successfully");
 
@@ -414,7 +394,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
 
         let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
 
-        match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
+        match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
             ControlFlow::Continue(uploads) => {
                 info!("Remote objects created successfully");
 
@@ -478,166 +458,3 @@ fn create_azure_client(
         GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
     ))
 }
-
-struct Uploads {
-    prefixes: HashSet<RemotePath>,
-    blobs: HashSet<RemotePath>,
-}
-
-async fn upload_azure_data(
-    client: &Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
-    upload_tasks_count: usize,
-) -> ControlFlow<Uploads, Uploads> {
-    info!("Creating {upload_tasks_count} Azure files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
-            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
-                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, len, &blob_path, None).await?;
-
-            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok((upload_prefix, upload_path)) => {
-                uploaded_prefixes.insert(upload_prefix);
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    let uploads = Uploads {
-        prefixes: uploaded_prefixes,
-        blobs: uploaded_blobs,
-    };
-    if upload_tasks_failed {
-        ControlFlow::Break(uploads)
-    } else {
-        ControlFlow::Continue(uploads)
-    }
-}
-
-async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
-    info!(
-        "Removing {} objects from the remote storage during cleanup",
-        objects_to_delete.len()
-    );
-    let mut delete_tasks = JoinSet::new();
-    for object_to_delete in objects_to_delete {
-        let task_client = Arc::clone(client);
-        delete_tasks.spawn(async move {
-            debug!("Deleting remote item at path {object_to_delete:?}");
-            task_client
-                .delete(&object_to_delete)
-                .await
-                .with_context(|| format!("{object_to_delete:?} removal"))
-        });
-    }
-
-    while let Some(task_run_result) = delete_tasks.join_next().await {
-        match task_run_result {
-            Ok(task_result) => match task_result {
-                Ok(()) => {}
-                Err(e) => error!("Delete task failed: {e:?}"),
-            },
-            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
-        }
-    }
-}
-
-// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
-async fn upload_simple_azure_data(
-    client: &Arc<GenericRemoteStorage>,
-    upload_tasks_count: usize,
-) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
-    info!("Creating {upload_tasks_count} Azure files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(
-                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
-            )
-            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, len, &blob_path, None).await?;
-
-            Ok::<_, anyhow::Error>(blob_path)
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok(upload_path) => {
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    if upload_tasks_failed {
-        ControlFlow::Break(uploaded_blobs)
-    } else {
-        ControlFlow::Continue(uploaded_blobs)
-    }
-}
-
-// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
-// to binary
-fn upload_stream(
-    content: std::borrow::Cow<'static, [u8]>,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    use std::borrow::Cow;
-
-    let content = match content {
-        Cow::Borrowed(x) => Bytes::from_static(x),
-        Cow::Owned(vec) => Bytes::from(vec),
-    };
-    wrap_stream(content)
-}
-
-fn wrap_stream(
-    content: bytes::Bytes,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    let len = content.len();
-    let content = futures::future::ready(Ok(content));
-
-    (futures::stream::once(content), len)
-}
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index ecd834e61c..8f46b2abd6 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -2,23 +2,23 @@ use std::collections::HashSet;
 use std::env;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
-use bytes::Bytes;
 use camino::Utf8Path;
-use futures::stream::Stream;
-use once_cell::sync::OnceCell;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
 use test_context::{test_context, AsyncTestContext};
-use tokio::task::JoinSet;
-use tracing::{debug, error, info};
+use tracing::{debug, info};
 
-static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+mod common;
+
+use common::{
+    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
+    upload_stream, wrap_stream,
+};
 
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 
@@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test";
 /// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
 /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
 ///
-/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
+/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
 /// where
 /// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
 /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
@@ -95,7 +95,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
 /// See `s3_pagination_should_work` for more information.
 ///
-/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
+/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_simple_remote_data`]
 /// Then performs the following queries:
 ///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
 ///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
@@ -198,15 +198,65 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
     Ok(())
 }
 
-fn ensure_logging_ready() {
-    LOGGING_DONE.get_or_init(|| {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-            utils::logging::Output::Stdout,
-        )
-        .expect("logging init failed");
-    });
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let MaybeEnabledS3::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
+
+    let (data, len) = wrap_stream(orig.clone());
+
+    ctx.client.upload(data, len, &path, None).await?;
+
+    // Normal download request
+    let dl = ctx.client.download(&path).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // Full range (end specified)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, Some(len as u64))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // partial range (end specified)
+    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..10]);
+
+    // partial range (end beyond real end)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[8..]);
+
+    // Partial range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..]);
+
+    // Full range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
 }
 
 struct EnabledS3 {
@@ -278,7 +328,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
 
         let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
 
-        match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
+        match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
             ControlFlow::Continue(uploads) => {
                 info!("Remote objects created successfully");
 
@@ -340,7 +390,7 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
 
         let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
 
-        match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
+        match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
             ControlFlow::Continue(uploads) => {
                 info!("Remote objects created successfully");
 
@@ -403,166 +453,3 @@ fn create_s3_client(
         GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
     ))
 }
-
-struct Uploads {
-    prefixes: HashSet<RemotePath>,
-    blobs: HashSet<RemotePath>,
-}
-
-async fn upload_s3_data(
-    client: &Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
-    upload_tasks_count: usize,
-) -> ControlFlow<Uploads, Uploads> {
-    info!("Creating {upload_tasks_count} S3 files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
-            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
-                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let (data, data_len) =
-                upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, data_len, &blob_path, None).await?;
-
-            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok((upload_prefix, upload_path)) => {
-                uploaded_prefixes.insert(upload_prefix);
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    let uploads = Uploads {
-        prefixes: uploaded_prefixes,
-        blobs: uploaded_blobs,
-    };
-    if upload_tasks_failed {
-        ControlFlow::Break(uploads)
-    } else {
-        ControlFlow::Continue(uploads)
-    }
-}
-
-async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
-    info!(
-        "Removing {} objects from the remote storage during cleanup",
-        objects_to_delete.len()
-    );
-    let mut delete_tasks = JoinSet::new();
-    for object_to_delete in objects_to_delete {
-        let task_client = Arc::clone(client);
-        delete_tasks.spawn(async move {
-            debug!("Deleting remote item at path {object_to_delete:?}");
-            task_client
-                .delete(&object_to_delete)
-                .await
-                .with_context(|| format!("{object_to_delete:?} removal"))
-        });
-    }
-
-    while let Some(task_run_result) = delete_tasks.join_next().await {
-        match task_run_result {
-            Ok(task_result) => match task_result {
-                Ok(()) => {}
-                Err(e) => error!("Delete task failed: {e:?}"),
-            },
-            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
-        }
-    }
-}
-
-// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
-async fn upload_simple_s3_data(
-    client: &Arc<GenericRemoteStorage>,
-    upload_tasks_count: usize,
-) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
-    info!("Creating {upload_tasks_count} S3 files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(
-                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
-            )
-            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let (data, data_len) =
-                upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, data_len, &blob_path, None).await?;
-
-            Ok::<_, anyhow::Error>(blob_path)
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok(upload_path) => {
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    if upload_tasks_failed {
-        ControlFlow::Break(uploaded_blobs)
-    } else {
-        ControlFlow::Continue(uploaded_blobs)
-    }
-}
-
-fn upload_stream(
-    content: std::borrow::Cow<'static, [u8]>,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    use std::borrow::Cow;
-
-    let content = match content {
-        Cow::Borrowed(x) => Bytes::from_static(x),
-        Cow::Owned(vec) => Bytes::from(vec),
-    };
-    wrap_stream(content)
-}
-
-fn wrap_stream(
-    content: bytes::Bytes,
-) -> (
-    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
-    usize,
-) {
-    let len = content.len();
-    let content = futures::future::ready(Ok(content));
-
-    (futures::stream::once(content), len)
-}

From 6ffbbb2e02916246ee17fc40a0d4accb90295bbd Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 19 Dec 2023 11:32:51 +0100
Subject: [PATCH 130/202] include timeline ids in tenant details response
 (#6166)

Part of getpage@lsn benchmark epic:
https://github.com/neondatabase/neon/issues/5771

This allows getting the list of tenants and timelines without requesting
the timeline details API, which triggers the initial logical size
calculation and would skew our results.
---
 libs/pageserver_api/src/models.rs |  8 ++++++++
 pageserver/client/src/mgmt_api.rs | 12 ++++++++++++
 pageserver/src/http/routes.rs     | 14 +++++++++-----
 pageserver/src/tenant.rs          |  4 ++++
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 0f5e202249..be41b610b8 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -370,6 +370,14 @@ pub struct TenantInfo {
     pub attachment_status: TenantAttachmentStatus,
 }
 
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TenantDetails {
+    #[serde(flatten)]
+    pub tenant_info: TenantInfo,
+
+    pub timelines: Vec<TimelineId>,
+}
+
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 77eb1bb8e2..0ad4e1551e 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -64,6 +64,18 @@ impl Client {
         resp.json().await.map_err(Error::ReceiveBody)
     }
 
+    pub async fn tenant_details(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<pageserver_api::models::TenantDetails> {
+        let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint);
+        self.get(uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn list_timelines(
         &self,
         tenant_id: TenantId,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index bc8b677f77..e641e44b08 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -14,6 +14,7 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::TenantDetails;
 use pageserver_api::models::{
     DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
     TenantLoadRequest, TenantLocationConfigRequest,
@@ -857,11 +858,14 @@ async fn tenant_status(
         }
 
         let state = tenant.current_state();
-        Result::<_, ApiError>::Ok(TenantInfo {
-            id: tenant_shard_id,
-            state: state.clone(),
-            current_physical_size: Some(current_physical_size),
-            attachment_status: state.attachment_status(),
+        Result::<_, ApiError>::Ok(TenantDetails {
+            tenant_info: TenantInfo {
+                id: tenant_shard_id,
+                state: state.clone(),
+                current_physical_size: Some(current_physical_size),
+                attachment_status: state.attachment_status(),
+            },
+            timelines: tenant.list_timeline_ids(),
         })
     }
     .instrument(info_span!("tenant_status_handler",
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index eceef6bf78..1d6f1001db 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1552,6 +1552,10 @@ impl Tenant {
             .collect()
     }
 
+    pub fn list_timeline_ids(&self) -> Vec<TimelineId> {
+        self.timelines.lock().unwrap().keys().cloned().collect()
+    }
+
     /// This is used to create the initial 'main' timeline during bootstrapping,
     /// or when importing a new base backup. The caller is expected to load an
     /// initial image of the datadir to the new timeline after this.

From d89af4cf8e7a17077c197a259f2f95a5afdeb2c2 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 19 Dec 2023 10:38:00 +0000
Subject: [PATCH 131/202] pageserver: downgrade 'connection reset' WAL errors
 (#6181)

This squashes a particularly noisy warn-level log that occurs when
safekeepers are restarted.

Unfortunately the error type from `tonic` doesn't provide a neat way of
matching this, so we use a string comparison
---
 .../src/tenant/timeline/walreceiver/connection_manager.rs       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 7bfa246eeb..5a5b3d7586 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -138,7 +138,7 @@ pub(super) async fn connection_manager_loop_step(
                     Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
                     Err(status) => {
                         match status.code() {
-                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
+                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => {
                                 // tonic's error handling doesn't provide a clear code for disconnections: we get
                                 // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
                                 info!("broker disconnected: {status}");

From b701394d7ab8aeeadd8221d9280ce5742a9509f4 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 19 Dec 2023 02:27:23 +0300
Subject: [PATCH 132/202] Fix WAL waiting in walproposer for v16.

Just preparing cv right before waiting is not enough as we might have already
missed the flushptr change & wakeup, so re-check flushptr before sleeping.

https://neondb.slack.com/archives/C03QLRH7PPD/p1702830965396619?thread_ts=1702756761.836649&cid=C03QLRH7PPD
---
 pgxn/neon/walproposer_pg.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 551d56d416..79498b64af 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1482,6 +1482,21 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 #if PG_MAJORVERSION_NUM >= 16
 	if (WalSndCtl != NULL)
 		ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv);
+
+	/*
+	 * Now that we prepared the condvar, check flush ptr again -- it might have
+	 * changed before we subscribed to cv so we missed the wakeup.
+	 *
+	 * Do that only when we're interested in new WAL: without sync-safekeepers
+	 * and if election already passed.
+	 */
+	if (!wp->config->syncSafekeepers && wp->availableLsn != InvalidXLogRecPtr && GetFlushRecPtr(NULL) > wp->availableLsn)
+	{
+		ConditionVariableCancelSleep();
+		ResetLatch(MyLatch);
+		*events = WL_LATCH_SET;
+		return 1;
+	}
 #endif
 
 	/*

From 73d247c464ad6b72ee5a6d787d12dd93cddbc392 Mon Sep 17 00:00:00 2001
From: Bodobolero <peterbendel@neon.tech>
Date: Tue, 19 Dec 2023 12:44:25 +0100
Subject: [PATCH 133/202] Analyze clickbench performance with explain plans and
 pg_stat_statements (#6161)

## Problem

To understand differences in performance between Neon, Aurora and RDS, we
want to collect EXPLAIN ANALYZE plans and pg_stat_statements for
selected benchmarking runs.

## Summary of changes

Add workflow input options to collect explain and pg_stat_statements for
benchmarking workflow

Co-authored-by: BodoBolero <bodobolero@gmail.com>
---
 .github/workflows/benchmarking.yml        | 29 +++++++++++--
 test_runner/performance/test_perf_olap.py | 51 ++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index e2f15d96db..8bf12c31b1 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -11,7 +11,7 @@ on:
     #          │ │ ┌───────────── day of the month (1 - 31)
     #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
     #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron:  '0 3 * * *' # run once a day, timezone is utc
+    - cron:   '0 3 * * *' # run once a day, timezone is utc
 
   workflow_dispatch: # adds ability to run this manually
     inputs:
@@ -23,6 +23,21 @@ on:
         type: boolean
         description: 'Publish perf report. If not set, the report will be published only for the main branch'
         required: false
+      collect_olap_explain:
+        type: boolean
+        description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected'
+        required: false
+        default: false
+      collect_pg_stat_statements:
+        type: boolean
+        description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected'
+        required: false
+        default: false
+      run_AWS_RDS_AND_AURORA:
+        type: boolean
+        description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
+        required: false
+        default: false
 
 defaults:
   run:
@@ -113,6 +128,8 @@ jobs:
     # - neon-captest-reuse: Reusing existing project
     # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
     # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
+    env:
+      RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
     runs-on: ubuntu-latest
     outputs:
       pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -152,7 +169,7 @@ jobs:
           ]
         }'
 
-        if [ "$(date +%A)" = "Saturday" ]; then
+        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
                                                    { "platform": "rds-aurora"   }]')
         fi
@@ -171,9 +188,9 @@ jobs:
           ]
         }'
 
-        if [ "$(date +%A)" = "Saturday" ]; then
+        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                   { "platform": "rds-aurora",   "scale": "10" }]')
+                                                    { "platform": "rds-aurora",   "scale": "10" }]')
         fi
 
         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -337,6 +354,8 @@ jobs:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
       DEFAULT_PG_VERSION: 14
       TEST_OUTPUT: /tmp/test_output
+      TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
+      TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
       BUILD_TYPE: remote
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.platform }}
@@ -399,6 +418,8 @@ jobs:
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }}
+        TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }}
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
         TEST_OLAP_SCALE: 10
 
diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py
index 0f7615f7ed..1de7e95bbe 100644
--- a/test_runner/performance/test_perf_olap.py
+++ b/test_runner/performance/test_perf_olap.py
@@ -17,6 +17,27 @@ class LabelledQuery:
     query: str
 
 
+# This must run before all tests in this module
+# create extension pg_stat_statements if it does not exist
+# and TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false)
+# Theoretically this could be in a module or session scope fixture,
+# however the code depends on other fixtures that have function scope
+@pytest.mark.skipif(
+    os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false",
+    reason="Skipping - Creating extension pg_stat_statements",
+)
+@pytest.mark.remote_cluster
+def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare):
+    log.info("Creating extension pg_stat_statements")
+    query = LabelledQuery(
+        "Q_CREATE_EXTENSION", r"CREATE EXTENSION IF NOT EXISTS pg_stat_statements;"
+    )
+    run_psql(remote_compare, query, times=1, explain=False)
+    log.info("Reset pg_stat_statements")
+    query = LabelledQuery("Q_RESET", r"SELECT pg_stat_statements_reset();")
+    run_psql(remote_compare, query, times=1, explain=False)
+
+
 # A list of queries to run.
 # Please do not alter the label for the query, as it is used to identify it.
 # Labels for ClickBench queries match the labels in ClickBench reports
@@ -78,6 +99,8 @@ QUERIES: Tuple[LabelledQuery, ...] = (
     # fmt: on
 )
 
+EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"
+
 
 def get_scale() -> List[str]:
     # We parametrize each tpc-h and clickbench test with scale
@@ -88,7 +111,10 @@ def get_scale() -> List[str]:
     return [scale]
 
 
-def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None:
+# run the query `times` times plus once with EXPLAIN VERBOSE if explain is requested
+def run_psql(
+    env: RemoteCompare, labelled_query: LabelledQuery, times: int, explain: bool = False
+) -> None:
     # prepare connstr:
     # - cut out password from connstr to pass it via env
     # - add options to connstr
@@ -108,6 +134,13 @@ def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> N
         log.info(f"Run {run}/{times}")
         with env.zenbenchmark.record_duration(f"{label}/{run}"):
             env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ)
+    if explain:
+        log.info(f"Explaining query {label}")
+        run += 1
+        with env.zenbenchmark.record_duration(f"{label}/EXPLAIN"):
+            env.pg_bin.run_capture(
+                ["psql", connstr, "-c", f"{EXPLAIN_STRING} {query}"], env=environ
+            )
 
 
 @pytest.mark.parametrize("scale", get_scale())
@@ -120,8 +153,9 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale:
     Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql
     The DB prepared manually in advance
     """
+    explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true"
 
-    run_psql(remote_compare, query, times=3)
+    run_psql(remote_compare, query, times=3, explain=explain)
 
 
 def tpch_queuies() -> Tuple[ParameterSet, ...]:
@@ -195,3 +229,16 @@ def test_user_examples(remote_compare: RemoteCompare):
         """,
     )
     run_psql(remote_compare, query, times=3)
+
+
+# This must run after all tests in this module
+# Collect pg_stat_statements after running the tests if TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false)
+@pytest.mark.skipif(
+    os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false",
+    reason="Skipping - Collecting pg_stat_statements",
+)
+@pytest.mark.remote_cluster
+def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare):
+    log.info("Collecting pg_stat_statements")
+    query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;")
+    run_psql(remote_compare, query, times=1, explain=False)

From c52384752e3fa1ef89bd4b7aa9a70b173f62dd78 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Tue, 19 Dec 2023 14:10:07 +0100
Subject: [PATCH 134/202] Compile `pg_semver` extension (#6184)

Closes #6183
---
 Dockerfile.compute-node | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 03280586f8..a23e930c48 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -569,6 +569,23 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
 
+#########################################################################################
+#
+# Layer "pg-semver-pg-build"
+# compile pg_semver extension
+#
+#########################################################################################
+FROM build-deps AS pg-semver-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
+    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
+    mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
+
 #########################################################################################
 #
 # Layer "pg-embedding-pg-build"
@@ -768,6 +785,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY pgxn/ pgxn/

From a2fab3437161fd80c205fcab61ae88a82bd9cee5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 19 Dec 2023 14:16:53 +0100
Subject: [PATCH 135/202] Update zstd to 0.13 (#6187)

This updates the `zstd` crate to 0.13, and `zstd-sys` with it (it
contains C so we should always run the newest version of that).
---
 Cargo.lock                | 18 ++++++++----------
 compute_tools/Cargo.toml  |  2 +-
 workspace_hack/Cargo.toml |  4 ++--
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9a367effbb..7821b3658a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -190,9 +190,9 @@ dependencies = [
 
 [[package]]
 name = "async-compression"
-version = "0.4.0"
+version = "0.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
+checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
 dependencies = [
  "flate2",
  "futures-core",
@@ -6412,30 +6412,28 @@ checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
 
 [[package]]
 name = "zstd"
-version = "0.12.4"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
+checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110"
 dependencies = [
  "zstd-safe",
 ]
 
 [[package]]
 name = "zstd-safe"
-version = "6.0.6"
+version = "7.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
+checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e"
 dependencies = [
- "libc",
  "zstd-sys",
 ]
 
 [[package]]
 name = "zstd-sys"
-version = "2.0.8+zstd.1.5.5"
+version = "2.0.9+zstd.1.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
+checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656"
 dependencies = [
  "cc",
- "libc",
  "pkg-config",
 ]
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 47378f1910..18b30810b0 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -37,5 +37,5 @@ workspace_hack.workspace = true
 toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
-zstd = "0.12.4"
+zstd = "0.13"
 bytes = "1.0"
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 4621a75c0b..ffff0fda61 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -75,8 +75,8 @@ tracing-core = { version = "0.1" }
 tungstenite = { version = "0.20" }
 url = { version = "2", features = ["serde"] }
 uuid = { version = "1", features = ["serde", "v4"] }
-zstd = { version = "0.12" }
-zstd-safe = { version = "6", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
+zstd = { version = "0.13" }
+zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }
 
 [build-dependencies]

From e6bf6952b86b4ee757cd64ab187a78cc6715a5cb Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 19 Dec 2023 14:46:17 +0100
Subject: [PATCH 136/202] higher resolution histograms for getpage@lsn (#6177)

part of https://github.com/neondatabase/cloud/issues/7811
---
 pageserver/src/metrics.rs | 52 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 45c01b71d1..3f0bc3e0a7 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1019,12 +1019,62 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
+    [
+        1,
+        10,
+        20,
+        40,
+        60,
+        80,
+        100,
+        200,
+        300,
+        400,
+        500,
+        600,
+        700,
+        800,
+        900,
+        1_000, // 1ms
+        2_000,
+        4_000,
+        6_000,
+        8_000,
+        10_000, // 10ms
+        20_000,
+        40_000,
+        60_000,
+        80_000,
+        100_000,
+        200_000,
+        400_000,
+        600_000,
+        800_000,
+        1_000_000, // 1s
+        2_000_000,
+        4_000_000,
+        6_000_000,
+        8_000_000,
+        10_000_000, // 10s
+        20_000_000,
+        50_000_000,
+        100_000_000,
+        200_000_000,
+        1_000_000_000, // 1000s
+    ]
+    .into_iter()
+    .map(Duration::from_micros)
+    .map(|d| d.as_secs_f64())
+    .collect()
+});
+
 static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_smgr_query_seconds_global",
         "Time spent on smgr query handling, aggregated by query type.",
         &["smgr_query_type"],
-        CRITICAL_OP_BUCKETS.into(),
+        SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
     )
     .expect("failed to define a metric")
 });

From 8b91bbc38ef385d2d7dbbb0ce1ee66b8deec6302 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 19 Dec 2023 16:45:17 +0100
Subject: [PATCH 137/202] Update jsonwebtoken to 9 and sct to 0.7.1 (#6189)

This extends the list of crates that are based on `ring` 0.17.
---
 Cargo.lock                | 28 +++++++++++++++-------------
 Cargo.toml                |  2 +-
 workspace_hack/Cargo.toml |  2 +-
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 7821b3658a..0e51e88e3b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2487,13 +2487,14 @@ dependencies = [
 
 [[package]]
 name = "jsonwebtoken"
-version = "8.3.0"
+version = "9.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378"
+checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
 dependencies = [
  "base64 0.21.1",
- "pem 1.1.1",
- "ring 0.16.20",
+ "js-sys",
+ "pem 3.0.3",
+ "ring 0.17.6",
  "serde",
  "serde_json",
  "simple_asn1",
@@ -3291,18 +3292,19 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
 
 [[package]]
 name = "pem"
-version = "1.1.1"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8"
+checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
 dependencies = [
- "base64 0.13.1",
+ "base64 0.21.1",
+ "serde",
 ]
 
 [[package]]
 name = "pem"
-version = "2.0.1"
+version = "3.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
+checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
 dependencies = [
  "base64 0.21.1",
  "serde",
@@ -4428,12 +4430,12 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
 
 [[package]]
 name = "sct"
-version = "0.7.0"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4"
+checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
 dependencies = [
- "ring 0.16.20",
- "untrusted 0.7.1",
+ "ring 0.17.6",
+ "untrusted 0.9.0",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index b44544d626..6884de7bf5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -91,7 +91,7 @@ hyper-tungstenite = "0.11"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
-jsonwebtoken = "8"
+jsonwebtoken = "9"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index ffff0fda61..82bbedc4ae 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -56,7 +56,7 @@ regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
 regex-syntax = { version = "0.8" }
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] }
-ring = { version = "0.16", features = ["std"] }
+ring = { version = "0.16" }
 rustls = { version = "0.21", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }

From 8ff5387da142a0129a884618a5b1a3159a46f544 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 19 Dec 2023 18:17:11 +0200
Subject: [PATCH 138/202] eliminate GCC warning for unchecked result of fread
 (#6167)

## Problem


GCC produces a warning that the fread result is not checked. It doesn't affect
program logic, but it is better to live without warnings.

## Summary of changes

Check read result.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 pgxn/neon/walproposer_pg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 79498b64af..9361f08ad2 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1712,9 +1712,9 @@ walprop_pg_after_election(WalProposer *wp)
 	f = fopen("restart.lsn", "rb");
 	if (f != NULL && !wp->config->syncSafekeepers)
 	{
-		fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
+		size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
 		fclose(f);
-		if (lrRestartLsn != InvalidXLogRecPtr)
+		if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr)
 		{
 			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
 

From 0bd79eb063282408d6cc424d508bae86ff5dae4c Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Tue, 19 Dec 2023 16:27:47 +0000
Subject: [PATCH 139/202] Handle role deletion when project has no databases.
 (#6170)

There is still the default 'postgres' database, which may contain objects
owned by the role or some ACLs. We need to reassign objects in this
database too.

## Problem
If a customer deletes all databases and then tries to delete a role that
has some non-standard ACLs, the `apply_config` operation will get stuck
because the role deletion fails.
---
 compute_tools/src/spec.rs | 52 +++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 20299c8fde..d545858dc2 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -370,33 +370,49 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli
     Ok(())
 }
 
+fn reassign_owned_objects_in_one_db(
+    conf: Config,
+    role_name: &PgIdent,
+    db_owner: &PgIdent,
+) -> Result<()> {
+    let mut client = conf.connect(NoTls)?;
+
+    // This will reassign all dependent objects to the db owner
+    let reassign_query = format!(
+        "REASSIGN OWNED BY {} TO {}",
+        role_name.pg_quote(),
+        db_owner.pg_quote()
+    );
+    info!(
+        "reassigning objects owned by '{}' in db '{}' to '{}'",
+        role_name,
+        conf.get_dbname().unwrap_or(""),
+        db_owner
+    );
+    client.simple_query(&reassign_query)?;
+
+    // This now will only drop privileges of the role
+    let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
+    client.simple_query(&drop_query)?;
+    Ok(())
+}
+
 // Reassign all owned objects in all databases to the owner of the database.
 fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
     for db in &spec.cluster.databases {
         if db.owner != *role_name {
             let mut conf = Config::from_str(connstr)?;
             conf.dbname(&db.name);
-
-            let mut client = conf.connect(NoTls)?;
-
-            // This will reassign all dependent objects to the db owner
-            let reassign_query = format!(
-                "REASSIGN OWNED BY {} TO {}",
-                role_name.pg_quote(),
-                db.owner.pg_quote()
-            );
-            info!(
-                "reassigning objects owned by '{}' in db '{}' to '{}'",
-                role_name, &db.name, &db.owner
-            );
-            client.simple_query(&reassign_query)?;
-
-            // This now will only drop privileges of the role
-            let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
-            client.simple_query(&drop_query)?;
+            reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?;
         }
     }
 
+    // Also handle case when there are no databases in the spec.
+    // In this case we need to reassign objects in the default database.
+    let conf = Config::from_str(connstr)?;
+    let db_owner = PgIdent::from_str("cloud_admin")?;
+    reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?;
+
     Ok(())
 }
 

From 82809d2ec2491fec8ab06219de3d2d03bbb92275 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 19 Dec 2023 17:44:49 +0100
Subject: [PATCH 140/202] fix metric
 `pageserver_initial_logical_size_start_calculation` (#6191)

It wasn't being incremented.

Fixup of

    commit 1c88824ed0e6bfbce02fa92e13ca91d5ab0e37b3
    Author: Christian Schwarz <christian@neon.tech>
    Date:   Fri Dec 1 12:52:59 2023 +0100

        initial logical size calculation: add a bunch of metrics (#5995)
---
 pageserver/src/metrics.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 3f0bc3e0a7..4725903783 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -522,14 +522,18 @@ pub(crate) mod initial_logical_size {
     impl StartCalculation {
         pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
             let circumstances_label: &'static str = circumstances.into();
-            self.0.with_label_values(&["first", circumstances_label]);
+            self.0
+                .with_label_values(&["first", circumstances_label])
+                .inc();
             OngoingCalculationGuard {
                 inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
             }
         }
         pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
             let circumstances_label: &'static str = circumstances.into();
-            self.0.with_label_values(&["retry", circumstances_label]);
+            self.0
+                .with_label_values(&["retry", circumstances_label])
+                .inc();
             OngoingCalculationGuard {
                 inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
             }

From 613906acea9a4f98920ba518301ea8195e1687a1 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Tue, 19 Dec 2023 21:06:43 +0400
Subject: [PATCH 141/202] Support custom types in broker (#5761)

Old methods are unchanged for backwards compatibility. Added the
`SafekeeperDiscoveryRequest` and `SafekeeperDiscoveryResponse` types to
serve as an example, and also as a prerequisite for
https://github.com/neondatabase/neon/issues/5471
---
 storage_broker/benches/rps.rs            |  29 ++-
 storage_broker/proto/broker.proto        |  58 +++++
 storage_broker/src/bin/storage_broker.rs | 277 +++++++++++++++++++----
 storage_broker/src/metrics.rs            |  34 ++-
 4 files changed, 350 insertions(+), 48 deletions(-)

diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs
index a0c8e1f749..d66cbefa45 100644
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -3,9 +3,12 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 
 use clap::Parser;
-use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
-use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
-use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
+
+use storage_broker::proto::SafekeeperTimelineInfo;
+use storage_broker::proto::{
+    FilterTenantTimelineId, MessageType, SubscribeByFilterRequest,
+    TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage,
+};
 
 use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT};
 use tokio::time;
@@ -91,15 +94,23 @@ async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>,
         None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(),
     };
 
-    let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
+    let ttid = ProtoTenantTimelineId {
         tenant_id: vec![0xFF; 16],
         timeline_id: tli_from_u64(i),
-    });
-    let request = SubscribeSafekeeperInfoRequest {
-        subscription_key: Some(key),
     };
-    let mut stream = client
-        .subscribe_safekeeper_info(request)
+
+    let request = SubscribeByFilterRequest {
+        types: vec![TypeSubscription {
+            r#type: MessageType::SafekeeperTimelineInfo.into(),
+        }],
+        tenant_timeline_id: Some(FilterTenantTimelineId {
+            enabled: true,
+            tenant_timeline_id: Some(ttid),
+        }),
+    };
+
+    let mut stream: tonic::Streaming<TypedMessage> = client
+        .subscribe_by_filter(request)
         .await
         .unwrap()
         .into_inner();
diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto
index aa9d62a29f..7d1b63d23f 100644
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -10,6 +10,12 @@ service BrokerService {
 
     // Publish safekeeper updates.
     rpc PublishSafekeeperInfo(stream SafekeeperTimelineInfo) returns (google.protobuf.Empty) {};
+
+    // Subscribe to all messages, limited by a filter.
+    rpc SubscribeByFilter(SubscribeByFilterRequest) returns (stream TypedMessage) {};
+
+    // Publish one message.
+    rpc PublishOne(TypedMessage) returns (google.protobuf.Empty) {};
 }
 
 message SubscribeSafekeeperInfoRequest {
@@ -48,3 +54,55 @@ message TenantTimelineId {
     bytes tenant_id = 1;
     bytes timeline_id = 2;
 }
+
+message FilterTenantTimelineId {
+    // If true, only messages related to `tenant_timeline_id` will be emitted.
+    // Otherwise, messages for all timelines will be emitted.
+    bool enabled = 1;
+    TenantTimelineId tenant_timeline_id = 2;
+}
+
+message TypeSubscription {
+    MessageType type = 1;
+}
+
+message SubscribeByFilterRequest {
+    // Subscription will emit messages only of the specified types. You need to specify
+    // at least one type to receive any messages.
+    repeated TypeSubscription types = 1;
+
+    // If set and enabled, subscription will emit messages only for the specified tenant/timeline.
+    optional FilterTenantTimelineId tenant_timeline_id = 2;
+}
+
+enum MessageType {
+    UNKNOWN = 0;
+    SAFEKEEPER_TIMELINE_INFO = 2;
+    SAFEKEEPER_DISCOVERY_REQUEST = 3;
+    SAFEKEEPER_DISCOVERY_RESPONSE = 4;
+}
+
+// A message with a type.
+message TypedMessage {
+    MessageType type = 1;
+
+    optional SafekeeperTimelineInfo safekeeper_timeline_info = 2;
+    optional SafekeeperDiscoveryRequest safekeeper_discovery_request = 3;
+    optional SafekeeperDiscoveryResponse safekeeper_discovery_response = 4;
+}
+
+message SafekeeperDiscoveryRequest {
+    TenantTimelineId tenant_timeline_id = 1;
+}
+
+// Shorter version of SafekeeperTimelineInfo, contains only necessary fields.
+message SafekeeperDiscoveryResponse {
+    uint64 safekeeper_id = 1;
+    TenantTimelineId tenant_timeline_id = 2;
+    // WAL available to download.
+    uint64 commit_lsn = 3;
+    // A connection string to use for WAL downloading.
+    string safekeeper_connstr = 4;
+    // Availability zone of a safekeeper.
+    optional string availability_zone = 5;
+}
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index 9f81ac6cac..4e5f8ed724 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -35,10 +35,16 @@ use tracing::*;
 use utils::signals::ShutdownSignals;
 
 use metrics::{Encoder, TextEncoder};
-use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE};
+use storage_broker::metrics::{
+    BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL,
+    NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL,
+};
 use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer};
 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
-use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
+use storage_broker::proto::{
+    FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
+    SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage,
+};
 use storage_broker::{
     parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR,
 };
@@ -73,8 +79,103 @@ struct Args {
     log_format: String,
 }
 
-type PubId = u64; // id of publisher for registering in maps
-type SubId = u64; // id of subscriber for registering in maps
+/// Id of publisher for registering in maps
+type PubId = u64;
+
+/// Id of subscriber for registering in maps
+type SubId = u64;
+
+/// Single enum type for all messages.
+#[derive(Clone, Debug, PartialEq)]
+#[allow(clippy::enum_variant_names)]
+enum Message {
+    SafekeeperTimelineInfo(SafekeeperTimelineInfo),
+    SafekeeperDiscoveryRequest(SafekeeperDiscoveryRequest),
+    SafekeeperDiscoveryResponse(SafekeeperDiscoveryResponse),
+}
+
+impl Message {
+    /// Convert proto message to internal message.
+    pub fn from(proto_msg: TypedMessage) -> Result<Self, Status> {
+        match proto_msg.r#type() {
+            MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo(
+                proto_msg.safekeeper_timeline_info.ok_or_else(|| {
+                    Status::new(Code::InvalidArgument, "missing safekeeper_timeline_info")
+                })?,
+            )),
+            MessageType::SafekeeperDiscoveryRequest => Ok(Message::SafekeeperDiscoveryRequest(
+                proto_msg.safekeeper_discovery_request.ok_or_else(|| {
+                    Status::new(
+                        Code::InvalidArgument,
+                        "missing safekeeper_discovery_request",
+                    )
+                })?,
+            )),
+            MessageType::SafekeeperDiscoveryResponse => Ok(Message::SafekeeperDiscoveryResponse(
+                proto_msg.safekeeper_discovery_response.ok_or_else(|| {
+                    Status::new(
+                        Code::InvalidArgument,
+                        "missing safekeeper_discovery_response",
+                    )
+                })?,
+            )),
+            MessageType::Unknown => Err(Status::new(
+                Code::InvalidArgument,
+                format!("invalid message type: {:?}", proto_msg.r#type),
+            )),
+        }
+    }
+
+    /// Get the tenant_timeline_id from the message.
+    pub fn tenant_timeline_id(&self) -> Result<Option<TenantTimelineId>, Status> {
+        match self {
+            Message::SafekeeperTimelineInfo(msg) => Ok(msg
+                .tenant_timeline_id
+                .as_ref()
+                .map(parse_proto_ttid)
+                .transpose()?),
+            Message::SafekeeperDiscoveryRequest(msg) => Ok(msg
+                .tenant_timeline_id
+                .as_ref()
+                .map(parse_proto_ttid)
+                .transpose()?),
+            Message::SafekeeperDiscoveryResponse(msg) => Ok(msg
+                .tenant_timeline_id
+                .as_ref()
+                .map(parse_proto_ttid)
+                .transpose()?),
+        }
+    }
+
+    /// Convert internal message to the protobuf struct.
+    pub fn as_typed_message(&self) -> TypedMessage {
+        let mut res = TypedMessage {
+            r#type: self.message_type() as i32,
+            ..Default::default()
+        };
+        match self {
+            Message::SafekeeperTimelineInfo(msg) => {
+                res.safekeeper_timeline_info = Some(msg.clone())
+            }
+            Message::SafekeeperDiscoveryRequest(msg) => {
+                res.safekeeper_discovery_request = Some(msg.clone())
+            }
+            Message::SafekeeperDiscoveryResponse(msg) => {
+                res.safekeeper_discovery_response = Some(msg.clone())
+            }
+        }
+        res
+    }
+
+    /// Get the message type.
+    pub fn message_type(&self) -> MessageType {
+        match self {
+            Message::SafekeeperTimelineInfo(_) => MessageType::SafekeeperTimelineInfo,
+            Message::SafekeeperDiscoveryRequest(_) => MessageType::SafekeeperDiscoveryRequest,
+            Message::SafekeeperDiscoveryResponse(_) => MessageType::SafekeeperDiscoveryResponse,
+        }
+    }
+}
 
 #[derive(Copy, Clone, Debug)]
 enum SubscriptionKey {
@@ -83,7 +184,7 @@ enum SubscriptionKey {
 }
 
 impl SubscriptionKey {
-    // Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
+    /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
     pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result<Self, Status> {
         match key {
             ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All),
@@ -92,14 +193,29 @@ impl SubscriptionKey {
             }
         }
     }
+
+    /// Parse from FilterTenantTimelineId; a disabled filter selects all timelines.
+    pub fn from_proto_filter_tenant_timeline_id(
+        f: &FilterTenantTimelineId,
+    ) -> Result<Self, Status> {
+        if !f.enabled {
+            return Ok(SubscriptionKey::All);
+        }
+
+        let ttid =
+            parse_proto_ttid(f.tenant_timeline_id.as_ref().ok_or_else(|| {
+                Status::new(Code::InvalidArgument, "missing tenant_timeline_id")
+            })?)?;
+        Ok(SubscriptionKey::Timeline(ttid))
+    }
 }
 
-// Channel to timeline subscribers.
+/// Channel to timeline subscribers.
 struct ChanToTimelineSub {
-    chan: broadcast::Sender<SafekeeperTimelineInfo>,
-    // Tracked separately to know when delete the shmem entry. receiver_count()
-    // is unhandy for that as unregistering and dropping the receiver side
-    // happens at different moments.
+    chan: broadcast::Sender<Message>,
+    /// Tracked separately to know when to delete the shmem entry. receiver_count()
+    /// is unhandy for that as unregistering and dropping the receiver side
+    /// happens at different moments.
     num_subscribers: u64,
 }
 
@@ -110,7 +226,7 @@ struct SharedState {
     num_subs_to_timelines: i64,
     chans_to_timeline_subs: HashMap<TenantTimelineId, ChanToTimelineSub>,
     num_subs_to_all: i64,
-    chan_to_all_subs: broadcast::Sender<SafekeeperTimelineInfo>,
+    chan_to_all_subs: broadcast::Sender<Message>,
 }
 
 impl SharedState {
@@ -146,7 +262,7 @@ impl SharedState {
         &mut self,
         sub_key: SubscriptionKey,
         timeline_chan_size: usize,
-    ) -> (SubId, broadcast::Receiver<SafekeeperTimelineInfo>) {
+    ) -> (SubId, broadcast::Receiver<Message>) {
         let sub_id = self.next_sub_id;
         self.next_sub_id += 1;
         let sub_rx = match sub_key {
@@ -262,6 +378,29 @@ impl Registry {
             subscriber.id, subscriber.key, subscriber.remote_addr
         );
     }
+
+    /// Send msg to relevant subscribers.
+    pub fn send_msg(&self, msg: &Message) -> Result<(), Status> {
+        PROCESSED_MESSAGES_TOTAL.inc();
+
+        // send message to subscribers for everything
+        let shared_state = self.shared_state.read();
+        // Err means there are no subscribers, which is fine.
+        shared_state.chan_to_all_subs.send(msg.clone()).ok();
+
+        // send message to per-timeline subscribers, if the message has a ttid
+        let ttid = msg.tenant_timeline_id()?;
+        if let Some(ttid) = ttid {
+            if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) {
+                // Err can't happen here, as tx is destroyed only after the last
+                // subscriber is removed from the map, along with tx.
+                subs.chan
+                    .send(msg.clone())
+                    .expect("rx is still in the map with zero subscribers");
+            }
+        }
+        Ok(())
+    }
 }
 
 // Private subscriber state.
@@ -269,7 +408,7 @@ struct Subscriber {
     id: SubId,
     key: SubscriptionKey,
     // Subscriber receives messages from publishers here.
-    sub_rx: broadcast::Receiver<SafekeeperTimelineInfo>,
+    sub_rx: broadcast::Receiver<Message>,
     // to unregister itself from shared state in Drop
     registry: Registry,
     // for logging
@@ -291,26 +430,9 @@ struct Publisher {
 }
 
 impl Publisher {
-    // Send msg to relevant subscribers.
-    pub fn send_msg(&mut self, msg: &SafekeeperTimelineInfo) -> Result<(), Status> {
-        // send message to subscribers for everything
-        let shared_state = self.registry.shared_state.read();
-        // Err means there is no subscribers, it is fine.
-        shared_state.chan_to_all_subs.send(msg.clone()).ok();
-
-        // send message to per timeline subscribers
-        let ttid =
-            parse_proto_ttid(msg.tenant_timeline_id.as_ref().ok_or_else(|| {
-                Status::new(Code::InvalidArgument, "missing tenant_timeline_id")
-            })?)?;
-        if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) {
-            // Err can't happen here, as tx is destroyed only after removing
-            // from the map the last subscriber along with tx.
-            subs.chan
-                .send(msg.clone())
-                .expect("rx is still in the map with zero subscribers");
-        }
-        Ok(())
+    /// Send msg to relevant subscribers.
+    pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> {
+        self.registry.send_msg(msg)
     }
 }
 
@@ -339,7 +461,7 @@ impl BrokerService for Broker {
 
         loop {
             match stream.next().await {
-                Some(Ok(msg)) => publisher.send_msg(&msg)?,
+                Some(Ok(msg)) => publisher.send_msg(&Message::SafekeeperTimelineInfo(msg))?,
                 Some(Err(e)) => return Err(e), // grpc error from the stream
                 None => break,                 // closed stream
             }
@@ -371,8 +493,15 @@ impl BrokerService for Broker {
             let mut missed_msgs: u64 = 0;
             loop {
                 match subscriber.sub_rx.recv().await {
-                    Ok(info) => yield info,
+                    Ok(info) => {
+                        match info {
+                            Message::SafekeeperTimelineInfo(info) => yield info,
+                            _ => {},
+                        }
+                        BROADCASTED_MESSAGES_TOTAL.inc();
+                    },
                     Err(RecvError::Lagged(skipped_msg)) => {
+                        BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg);
                         missed_msgs += skipped_msg;
                         if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() {
                             warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full",
@@ -392,6 +521,78 @@ impl BrokerService for Broker {
             Box::pin(output) as Self::SubscribeSafekeeperInfoStream
         ))
     }
+
+    type SubscribeByFilterStream =
+        Pin<Box<dyn Stream<Item = Result<TypedMessage, Status>> + Send + 'static>>;
+
+    /// Subscribe to all messages, limited by a filter.
+    async fn subscribe_by_filter(
+        &self,
+        request: Request<SubscribeByFilterRequest>,
+    ) -> std::result::Result<Response<Self::SubscribeByFilterStream>, Status> {
+        let remote_addr = request
+            .remote_addr()
+            .expect("TCPConnectInfo inserted by handler");
+        let proto_filter = request.into_inner();
+        let ttid_filter = proto_filter
+            .tenant_timeline_id
+            .as_ref()
+            .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?;
+
+        let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?;
+        let types_set = proto_filter
+            .types
+            .iter()
+            .map(|t| t.r#type)
+            .collect::<std::collections::HashSet<_>>();
+
+        let mut subscriber = self.registry.register_subscriber(sub_key, remote_addr);
+
+        // transform rx into stream with item = Result, as method result demands
+        let output = async_stream::try_stream! {
+            let mut warn_interval = time::interval(Duration::from_millis(1000));
+            let mut missed_msgs: u64 = 0;
+            loop {
+                match subscriber.sub_rx.recv().await {
+                    Ok(msg) => {
+                        let msg_type = msg.message_type() as i32;
+                        if types_set.contains(&msg_type) {
+                            yield msg.as_typed_message();
+                            BROADCASTED_MESSAGES_TOTAL.inc();
+                        }
+                    },
+                    Err(RecvError::Lagged(skipped_msg)) => {
+                        BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg);
+                        missed_msgs += skipped_msg;
+                        if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() {
+                            warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full",
+                                subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs);
+                            missed_msgs = 0;
+                        }
+                    }
+                    Err(RecvError::Closed) => {
+                        // can't happen, we never drop the channel while there is a subscriber
+                        Err(Status::new(Code::Internal, "channel unexpectantly closed"))?;
+                    }
+                }
+            }
+        };
+
+        Ok(Response::new(
+            Box::pin(output) as Self::SubscribeByFilterStream
+        ))
+    }
+
+    /// Publish one message.
+    async fn publish_one(
+        &self,
+        request: Request<TypedMessage>,
+    ) -> std::result::Result<Response<()>, Status> {
+        let msg = Message::from(request.into_inner())?;
+        PUBLISHED_ONEOFF_MESSAGES_TOTAL.inc();
+        self.registry.send_msg(&msg)?;
+        Ok(Response::new(()))
+    }
 }
 
 // We serve only metrics and healthcheck through http1.
@@ -515,8 +716,8 @@ mod tests {
     use tokio::sync::broadcast::error::TryRecvError;
     use utils::id::{TenantId, TimelineId};
 
-    fn msg(timeline_id: Vec<u8>) -> SafekeeperTimelineInfo {
-        SafekeeperTimelineInfo {
+    fn msg(timeline_id: Vec<u8>) -> Message {
+        Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo {
             safekeeper_id: 1,
             tenant_timeline_id: Some(ProtoTenantTimelineId {
                 tenant_id: vec![0x00; 16],
@@ -533,7 +734,7 @@ mod tests {
             http_connstr: "neon-1-sk-1.local:7677".to_owned(),
             local_start_lsn: 0,
             availability_zone: None,
-        }
+        })
     }
 
     fn tli_from_u64(i: u64) -> Vec<u8> {
diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs
index f0649d0f68..1fd3dd5ad6 100644
--- a/storage_broker/src/metrics.rs
+++ b/storage_broker/src/metrics.rs
@@ -1,6 +1,6 @@
 //! Broker metrics.
 
-use metrics::{register_int_gauge, IntGauge};
+use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge};
 use once_cell::sync::Lazy;
 
 pub static NUM_PUBS: Lazy<IntGauge> = Lazy::new(|| {
@@ -23,3 +23,35 @@ pub static NUM_SUBS_ALL: Lazy<IntGauge> = Lazy::new(|| {
     )
     .expect("Failed to register metric")
 });
+
+pub static PROCESSED_MESSAGES_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "storage_broker_processed_messages_total",
+        "Number of messages received by storage broker, before routing and broadcasting"
+    )
+    .expect("Failed to register metric")
+});
+
+pub static BROADCASTED_MESSAGES_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "storage_broker_broadcasted_messages_total",
+        "Number of messages broadcasted (sent over network) to subscribers"
+    )
+    .expect("Failed to register metric")
+});
+
+pub static BROADCAST_DROPPED_MESSAGES_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "storage_broker_broadcast_dropped_messages_total",
+        "Number of messages dropped due to channel capacity overflow"
+    )
+    .expect("Failed to register metric")
+});
+
+pub static PUBLISHED_ONEOFF_MESSAGES_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "storage_broker_published_oneoff_messages_total",
+        "Number of one-off messages sent via PublishOne method"
+    )
+    .expect("Failed to register metric")
+});

From 58dbca6ce3bda243db96d4e24b146c059d231467 Mon Sep 17 00:00:00 2001
From: Em Sharnoff <sharnoff@neon.tech>
Date: Tue, 19 Dec 2023 15:48:41 -0800
Subject: [PATCH 142/202] Bump vm-builder v0.19.0 -> v0.21.0 (#6197)

Only applicable change was neondatabase/autoscaling#650, reducing the
vector scrape interval (inside the VM) from 15 seconds to 1 second.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 693ed1a66f..6cb6d9df02 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -857,7 +857,7 @@ jobs:
       run:
         shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.19.0
+      VM_BUILDER_VERSION: v0.21.0
 
     steps:
       - name: Checkout

From c29df806341d790f9c08e7b9c1f5fd615fb43118 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 20 Dec 2023 10:26:06 +0200
Subject: [PATCH 143/202] fix(layer): move backoff to spawned task (#5746)

Move the backoff to spawned task as it can still be useful; make the
sleep cancellable.
---
 pageserver/src/tenant/storage_layer/layer.rs | 33 +++++++++++---------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 9a8ddc1a6b..8ae911b31e 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -878,6 +878,23 @@ impl LayerInner {
                         Ok(())
                     }
                     Err(e) => {
+                        let consecutive_failures =
+                            this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
+
+                        let backoff = utils::backoff::exponential_backoff_duration_seconds(
+                            consecutive_failures.min(u32::MAX as usize) as u32,
+                            1.5,
+                            60.0,
+                        );
+
+                        let backoff = std::time::Duration::from_secs_f64(backoff);
+
+                        tokio::select! {
+                            _ = tokio::time::sleep(backoff) => {},
+                            _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
+                            _ = timeline.cancel.cancelled() => {},
+                        };
+
                         Err(e)
                     }
                 };
@@ -926,21 +943,9 @@ impl LayerInner {
                 Ok(permit)
             }
             Ok((Err(e), _permit)) => {
-                // FIXME: this should be with the spawned task and be cancellation sensitive
-                //
-                // while we should not need this, this backoff has turned out to be useful with
-                // a bug of unexpectedly deleted remote layer file (#5787).
-                let consecutive_failures =
-                    self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
+                // sleep already happened in the spawned task, if it was not cancelled
+                let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
                 tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
-                let backoff = utils::backoff::exponential_backoff_duration_seconds(
-                    consecutive_failures.min(u32::MAX as usize) as u32,
-                    1.5,
-                    60.0,
-                );
-                let backoff = std::time::Duration::from_secs_f64(backoff);
-
-                tokio::time::sleep(backoff).await;
                 Err(DownloadError::DownloadFailed)
             }
             Err(_gone) => Err(DownloadError::DownloadCancelled),

From f260f1565e220eb90a3ffe2fd15597735d156d5c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Dec 2023 12:26:20 +0000
Subject: [PATCH 144/202] pageserver: fixes + test updates for sharding (#6186)

This is a precursor to:
- https://github.com/neondatabase/neon/pull/6185

While that PR contains big changes to neon_local and attachment_service,
this PR contains a few unrelated standalone changes generated while
working on that branch:
- Fix restarting a pageserver when it contains multiple shards for the
same tenant
- When using location_config api to attach a tenant, create its
timelines dir
- Update test paths where generations were previously optional to make
them always-on: this avoids tests having to spuriously assert that
attachment_service is not None in order to make the linter happy.
- Add a TenantShardId python implementation for subsequent use in test
helpers that will be made shard-aware
- Teach scrubber to read across shards when checking for layer
existence: this is a refactor to track the list of existent layers at
tenant-level rather than locally to each timeline. This is a precursor
to testing shard splitting.
---
 libs/pageserver_api/src/shard.rs              |   2 +-
 pageserver/src/tenant/mgr.rs                  |  49 +++---
 s3_scrubber/src/checks.rs                     | 144 +++++++++++-------
 s3_scrubber/src/scan_metadata.rs              | 126 +++++++++++++--
 test_runner/fixtures/neon_fixtures.py         |  47 ++----
 test_runner/fixtures/types.py                 |  48 ++++++
 test_runner/performance/test_bulk_insert.py   |   1 -
 .../regress/test_attach_tenant_config.py      |   5 +-
 test_runner/regress/test_change_pageserver.py |   1 -
 .../regress/test_layers_from_future.py        |   1 -
 test_runner/regress/test_pageserver_api.py    |   4 +-
 .../regress/test_pageserver_generations.py    |  10 --
 .../regress/test_pageserver_restart.py        |   4 +-
 .../regress/test_pageserver_secondary.py      |   4 -
 test_runner/regress/test_remote_storage.py    |   2 -
 15 files changed, 293 insertions(+), 155 deletions(-)

diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 052fbd1402..3668f7939d 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -159,7 +159,7 @@ impl From<[u8; 18]> for TenantShardId {
 /// shard we're dealing with, but do not need to know the full ShardIdentity (because
 /// we won't be doing any page->shard mapping), and do not need to know the fully qualified
 /// TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct ShardIndex {
     pub shard_number: ShardNumber,
     pub shard_count: ShardCount,
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index b2f14db9f7..31d80026f0 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -514,10 +514,7 @@ pub async fn init_tenant_mgr(
             &ctx,
         ) {
             Ok(tenant) => {
-                tenants.insert(
-                    TenantShardId::unsharded(tenant.tenant_id()),
-                    TenantSlot::Attached(tenant),
-                );
+                tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
             }
             Err(e) => {
                 error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
@@ -962,35 +959,27 @@ impl TenantManager {
         }
 
         let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
+
+        // Directory structure is the same for attached and secondary modes:
+        // create it if it doesn't exist.  Timeline load/creation expects the
+        // timelines/ subdir to already exist.
+        //
+        // Does not need to be fsync'd because local storage is just a cache.
+        tokio::fs::create_dir_all(&timelines_path)
+            .await
+            .with_context(|| format!("Creating {timelines_path}"))?;
+
+        // Before activating either secondary or attached mode, persist the
+        // configuration, so that on restart we will re-attach (or re-start
+        // secondary) on the tenant.
+        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+            .await
+            .map_err(SetNewTenantConfigError::Persist)?;
 
         let new_slot = match &new_location_config.mode {
-            LocationMode::Secondary(_) => {
-                // Directory doesn't need to be fsync'd because if we crash it can
-                // safely be recreated next time this tenant location is configured.
-                tokio::fs::create_dir_all(&tenant_path)
-                    .await
-                    .with_context(|| format!("Creating {tenant_path}"))?;
-
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .map_err(SetNewTenantConfigError::Persist)?;
-
-                TenantSlot::Secondary
-            }
+            LocationMode::Secondary(_) => TenantSlot::Secondary,
             LocationMode::Attached(_attach_config) => {
-                let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-
-                // Directory doesn't need to be fsync'd because we do not depend on
-                // it to exist after crashes: it may be recreated when tenant is
-                // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                tokio::fs::create_dir_all(&tenant_path)
-                    .await
-                    .with_context(|| format!("Creating {timelines_path}"))?;
-
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .map_err(SetNewTenantConfigError::Persist)?;
-
                 let shard_identity = new_location_config.shard;
                 let tenant = tenant_spawn(
                     self.conf,
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index 2acbb2352b..7b9f96dce3 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -1,9 +1,12 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 
 use anyhow::Context;
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver_api::shard::ShardIndex;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
+use utils::id::TimelineId;
 
 use crate::cloud_admin_api::BranchData;
 use crate::metadata_stream::stream_listing;
@@ -40,7 +43,7 @@ impl TimelineAnalysis {
 
 pub(crate) fn branch_cleanup_and_check_errors(
     id: &TenantShardTimelineId,
-    s3_root: &RootTarget,
+    tenant_objects: &mut TenantObjectListing,
     s3_active_branch: Option<&BranchData>,
     console_branch: Option<BranchData>,
     s3_data: Option<S3TimelineBlobData>,
@@ -72,8 +75,8 @@ pub(crate) fn branch_cleanup_and_check_errors(
             match s3_data.blob_data {
                 BlobDataParseResult::Parsed {
                     index_part,
-                    index_part_generation,
-                    mut s3_layers,
+                    index_part_generation: _index_part_generation,
+                    s3_layers: _s3_layers,
                 } => {
                     if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
                         result.errors.push(format!(
@@ -111,65 +114,19 @@ pub(crate) fn branch_cleanup_and_check_errors(
                             ))
                         }
 
-                        let layer_map_key = (layer, metadata.generation);
-                        if !s3_layers.remove(&layer_map_key) {
+                        if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) {
                             // FIXME: this will emit false positives if an index was
                             // uploaded concurrently with our scan.  To make this check
                             // correct, we need to try sending a HEAD request for the
                             // layer we think is missing.
                             result.errors.push(format!(
-                                "index_part.json contains a layer {}{} that is not present in remote storage",
-                                layer_map_key.0.file_name(),
-                                layer_map_key.1.get_suffix()
+                                "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage",
+                                layer.file_name(),
+                                metadata.generation.get_suffix(),
+                                metadata.shard
                             ))
                         }
                     }
-
-                    let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers
-                        .into_iter()
-                        .filter(|(_layer_name, gen)|
-                            // A layer is only considered orphaned if it has a generation below
-                            // the index.  If the generation is >= the index, then the layer may
-                            // be an upload from a running pageserver, or even an upload from
-                            // a new generation that didn't upload an index yet.
-                            //
-                            // Even so, a layer that is not referenced by the index could just
-                            // be something enqueued for deletion, so while this check is valid
-                            // for indicating that a layer is garbage, it is not an indicator
-                            // of a problem.
-                            gen < &index_part_generation)
-                        .collect();
-
-                    if !orphan_layers.is_empty() {
-                        // An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report
-                        // these as a hint that there is something worth cleaning up here.
-                        result.warnings.push(format!(
-                            "index_part.json does not contain layers from S3: {:?}",
-                            orphan_layers
-                                .iter()
-                                .map(|(layer_name, gen)| format!(
-                                    "{}{}",
-                                    layer_name.file_name(),
-                                    gen.get_suffix()
-                                ))
-                                .collect::<Vec<_>>(),
-                        ));
-                        result.garbage_keys.extend(orphan_layers.iter().map(
-                            |(layer_name, layer_gen)| {
-                                let mut key = s3_root.timeline_root(id).prefix_in_bucket;
-                                let delimiter = s3_root.delimiter();
-                                if !key.ends_with(delimiter) {
-                                    key.push_str(delimiter);
-                                }
-                                key.push_str(&format!(
-                                    "{}{}",
-                                    &layer_name.file_name(),
-                                    layer_gen.get_suffix()
-                                ));
-                                key
-                            },
-                        ));
-                    }
                 }
                 BlobDataParseResult::Relic => {}
                 BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
@@ -204,6 +161,83 @@ pub(crate) fn branch_cleanup_and_check_errors(
     result
 }
 
+#[derive(Default)]
+pub(crate) struct LayerRef {
+    ref_count: usize,
+}
+
+/// Top-level index of objects in a tenant.  This may be used by any shard-timeline within
+/// the tenant to query whether an object exists.
+#[derive(Default)]
+pub(crate) struct TenantObjectListing {
+    shard_timelines:
+        HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>,
+}
+
+impl TenantObjectListing {
+    /// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall
+    /// list of layer keys for the Tenant.
+    pub(crate) fn push(
+        &mut self,
+        ttid: TenantShardTimelineId,
+        layers: HashSet<(LayerFileName, Generation)>,
+    ) {
+        let shard_index = ShardIndex::new(
+            ttid.tenant_shard_id.shard_number,
+            ttid.tenant_shard_id.shard_count,
+        );
+        let replaced = self.shard_timelines.insert(
+            (shard_index, ttid.timeline_id),
+            layers
+                .into_iter()
+                .map(|l| (l, LayerRef::default()))
+                .collect(),
+        );
+
+        assert!(
+            replaced.is_none(),
+            "Built from an S3 object listing, which should never repeat a key"
+        );
+    }
+
+    /// Having loaded a timeline index, check if a layer referenced by the index exists.  If it does,
+    /// the layer's refcount will be incremented.  Later, after calling this for all references in all indices
+    /// in a tenant, orphan layers may be detected by their zero refcounts.
+    ///
+    /// Returns true if the layer exists
+    pub(crate) fn check_ref(
+        &mut self,
+        timeline_id: TimelineId,
+        layer_file: &LayerFileName,
+        metadata: &IndexLayerMetadata,
+    ) -> bool {
+        let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
+            return false;
+        };
+
+        let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else {
+            return false;
+        };
+
+        layer_ref.ref_count += 1;
+
+        true
+    }
+
+    pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> {
+        let mut result = Vec::new();
+        for ((shard_index, timeline_id), layers) in &self.shard_timelines {
+            for ((layer_file, generation), layer_ref) in layers {
+                if layer_ref.ref_count == 0 {
+                    result.push((*shard_index, *timeline_id, layer_file.clone(), *generation))
+                }
+            }
+        }
+
+        result
+    }
+}
+
 #[derive(Debug)]
 pub(crate) struct S3TimelineBlobData {
     pub(crate) blob_data: BlobDataParseResult,
diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs
index 91347ca21b..bcc4d2e618 100644
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -2,22 +2,25 @@ use std::collections::{HashMap, HashSet};
 
 use crate::checks::{
     branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
-    TimelineAnalysis,
+    TenantObjectListing, TimelineAnalysis,
 };
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use aws_sdk_s3::Client;
 use futures_util::{pin_mut, StreamExt, TryStreamExt};
 use histogram::Histogram;
+use pageserver::tenant::remote_timeline_client::remote_layer_path;
 use pageserver::tenant::IndexPart;
+use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
+use utils::id::TenantId;
 
 #[derive(Serialize)]
 pub struct MetadataSummary {
     count: usize,
     with_errors: HashSet<TenantShardTimelineId>,
     with_warnings: HashSet<TenantShardTimelineId>,
-    with_garbage: HashSet<TenantShardTimelineId>,
+    with_orphans: HashSet<TenantShardTimelineId>,
     indices_by_version: HashMap<usize, usize>,
 
     layer_count: MinMaxHisto,
@@ -87,7 +90,7 @@ impl MetadataSummary {
             count: 0,
             with_errors: HashSet::new(),
             with_warnings: HashSet::new(),
-            with_garbage: HashSet::new(),
+            with_orphans: HashSet::new(),
             indices_by_version: HashMap::new(),
             layer_count: MinMaxHisto::new(),
             timeline_size_bytes: MinMaxHisto::new(),
@@ -141,6 +144,10 @@ impl MetadataSummary {
         }
     }
 
+    fn notify_timeline_orphan(&mut self, ttid: &TenantShardTimelineId) {
+        self.with_orphans.insert(*ttid);
+    }
+
     /// Long-form output for printing at end of a scan
     pub fn summary_string(&self) -> String {
         let version_summary: String = itertools::join(
@@ -154,7 +161,7 @@ impl MetadataSummary {
             "Timelines: {0}
 With errors: {1}
 With warnings: {2}
-With garbage: {3}
+With orphan layers: {3}
 Index versions: {version_summary}
 Timeline size bytes: {4}
 Layer size bytes: {5}
@@ -163,7 +170,7 @@ Timeline layer count: {6}
             self.count,
             self.with_errors.len(),
             self.with_warnings.len(),
-            self.with_garbage.len(),
+            self.with_orphans.len(),
             self.timeline_size_bytes.oneline(),
             self.layer_size_bytes.oneline(),
             self.layer_count.oneline(),
@@ -191,7 +198,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
 
     // Generate a stream of TenantTimelineId
     let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
-    let timelines = timelines.try_buffer_unordered(CONCURRENCY);
+    let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
     // Generate a stream of S3TimelineBlobData
@@ -204,17 +211,118 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
         Ok((ttid, data))
     }
     let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
-    let timelines = timelines.try_buffer_unordered(CONCURRENCY);
+    let timelines = timelines.try_buffered(CONCURRENCY);
 
+    // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
+    // shards in the same tenant might refer to one another's keys if a shard split has happened.
+
+    let mut tenant_id = None;
+    let mut tenant_objects = TenantObjectListing::default();
+    let mut tenant_timeline_results = Vec::new();
+
+    fn analyze_tenant(
+        tenant_id: TenantId,
+        summary: &mut MetadataSummary,
+        mut tenant_objects: TenantObjectListing,
+        timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
+    ) {
+        let mut timeline_generations = HashMap::new();
+        for (ttid, data) in timelines {
+            // Stash the generation of each timeline, for later use identifying orphan layers
+            if let BlobDataParseResult::Parsed {
+                index_part: _index_part,
+                index_part_generation,
+                s3_layers: _s3_layers,
+            } = &data.blob_data
+            {
+                timeline_generations.insert(ttid, *index_part_generation);
+            }
+
+            // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
+            // reference counts for layers across the tenant.
+            let analysis =
+                branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data));
+            summary.update_analysis(&ttid, &analysis);
+        }
+
+        // Identifying orphan layers must be done on a tenant-wide basis, because individual
+        // shards' layers may be referenced by other shards.
+        //
+        // Orphan layers are not a corruption, and not an indication of a problem.  They are just
+        // consuming some space in remote storage, and may be cleaned up at leisure.
+        for (shard_index, timeline_id, layer_file, generation) in tenant_objects.get_orphans() {
+            let ttid = TenantShardTimelineId {
+                tenant_shard_id: TenantShardId {
+                    tenant_id,
+                    shard_count: shard_index.shard_count,
+                    shard_number: shard_index.shard_number,
+                },
+                timeline_id,
+            };
+
+            if let Some(timeline_generation) = timeline_generations.get(&ttid) {
+                if &generation >= timeline_generation {
+                    // Candidate orphan layer is in the current or future generation relative
+                    // to the index we read for this timeline shard, so its absence from the index
+                    // doesn't make it an orphan: more likely, it is a case where the layer was
+                    // uploaded, but the index referencing the layer wasn't written yet.
+                    continue;
+                }
+            }
+
+            let orphan_path = remote_layer_path(
+                &tenant_id,
+                &timeline_id,
+                shard_index,
+                &layer_file,
+                generation,
+            );
+
+            tracing::info!("Orphan layer detected: {orphan_path}");
+
+            summary.notify_timeline_orphan(&ttid);
+        }
+    }
+
+    // Iterate through  all the timeline results.  These are in key-order, so
+    // all results for the same tenant will be adjacent.  We accumulate these,
+    // and then call `analyze_tenant` to flush, when we see the next tenant ID.
     let mut summary = MetadataSummary::new();
     pin_mut!(timelines);
     while let Some(i) = timelines.next().await {
         let (ttid, data) = i?;
         summary.update_data(&data);
 
-        let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data));
+        match tenant_id {
+            None => tenant_id = Some(ttid.tenant_shard_id.tenant_id),
+            Some(prev_tenant_id) => {
+                if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
+                    let tenant_objects = std::mem::take(&mut tenant_objects);
+                    let timelines = std::mem::take(&mut tenant_timeline_results);
+                    analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines);
+                    tenant_id = Some(ttid.tenant_shard_id.tenant_id);
+                }
+            }
+        }
 
-        summary.update_analysis(&ttid, &analysis);
+        if let BlobDataParseResult::Parsed {
+            index_part: _index_part,
+            index_part_generation: _index_part_generation,
+            s3_layers,
+        } = &data.blob_data
+        {
+            tenant_objects.push(ttid, s3_layers.clone());
+        }
+        tenant_timeline_results.push((ttid, data));
+    }
+
+    if !tenant_timeline_results.is_empty() {
+        analyze_tenant(
+            tenant_id.expect("Must be set if results are present"),
+            &mut summary,
+            tenant_objects,
+            tenant_timeline_results,
+        );
     }
 
     Ok(summary)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 42e122cefe..a9133f1c9c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -457,7 +457,6 @@ class NeonEnvBuilder:
         self.preserve_database_files = preserve_database_files
         self.initial_tenant = initial_tenant or TenantId.generate()
         self.initial_timeline = initial_timeline or TimelineId.generate()
-        self.enable_generations = True
         self.scrub_on_exit = False
         self.test_output_dir = test_output_dir
 
@@ -677,8 +676,7 @@ class NeonEnvBuilder:
 
                 pageserver.stop(immediate=True)
 
-            if self.env.attachment_service is not None:
-                self.env.attachment_service.stop(immediate=True)
+            self.env.attachment_service.stop(immediate=True)
 
             cleanup_error = None
 
@@ -772,13 +770,9 @@ class NeonEnv:
         self.initial_tenant = config.initial_tenant
         self.initial_timeline = config.initial_timeline
 
-        if config.enable_generations:
-            attachment_service_port = self.port_distributor.get_port()
-            self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}"
-            self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self)
-        else:
-            self.control_plane_api = None
-            self.attachment_service = None
+        attachment_service_port = self.port_distributor.get_port()
+        self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
+        self.attachment_service: NeonAttachmentService = NeonAttachmentService(self)
 
         # Create a config file corresponding to the options
         cfg: Dict[str, Any] = {
@@ -851,8 +845,7 @@ class NeonEnv:
         # Start up broker, pageserver and all safekeepers
         self.broker.try_start()
 
-        if self.attachment_service is not None:
-            self.attachment_service.start()
+        self.attachment_service.start()
 
         for pageserver in self.pageservers:
             pageserver.start()
@@ -1834,20 +1827,19 @@ class NeonPageserver(PgProtocol):
         """
         client = self.http_client()
         return client.tenant_attach(
-            tenant_id, config, config_null, generation=self.maybe_get_generation(tenant_id)
+            tenant_id,
+            config,
+            config_null,
+            generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id),
         )
 
     def tenant_detach(self, tenant_id: TenantId):
-        if self.env.attachment_service is not None:
-            self.env.attachment_service.attach_hook_drop(tenant_id)
+        self.env.attachment_service.attach_hook_drop(tenant_id)
 
         client = self.http_client()
         return client.tenant_detach(tenant_id)
 
     def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
-        # This API is only for use when generations are enabled
-        assert self.env.attachment_service is not None
-
         if config["mode"].startswith("Attached") and "generation" not in config:
             config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
 
@@ -1873,26 +1865,15 @@ class NeonPageserver(PgProtocol):
         generation: Optional[int] = None,
     ) -> TenantId:
         if generation is None:
-            generation = self.maybe_get_generation(tenant_id)
+            generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
         client = self.http_client(auth_token=auth_token)
         return client.tenant_create(tenant_id, conf, generation=generation)
 
     def tenant_load(self, tenant_id: TenantId):
         client = self.http_client()
-        return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id))
-
-    def maybe_get_generation(self, tenant_id: TenantId):
-        """
-        For tests that would like to use an HTTP client directly instead of using
-        the `tenant_attach` and `tenant_create` helpers here: issue a generation
-        number for a tenant.
-
-        Returns None if the attachment service is not enabled (legacy mode)
-        """
-        if self.env.attachment_service is not None:
-            return self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
-        else:
-            return None
+        return client.tenant_load(
+            tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
+        )
 
 
 def append_pageserver_param_overrides(
diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py
index d95368f990..ea648e460d 100644
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -125,3 +125,51 @@ class TenantId(Id):
 class TimelineId(Id):
     def __repr__(self) -> str:
         return f'TimelineId("{self.id.hex()}")'
+
+
+# Workaround for compat with python 3.9, which does not have `typing.Self`
+TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
+
+
+class TenantShardId:
+    def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int):
+        self.tenant_id = tenant_id
+        self.shard_number = shard_number
+        self.shard_count = shard_count
+        assert self.shard_number < self.shard_count or self.shard_count == 0
+
+    @classmethod
+    def parse(cls: Type[TTenantShardId], input) -> TTenantShardId:
+        if len(input) == 32:
+            return cls(
+                tenant_id=TenantId(input),
+                shard_number=0,
+                shard_count=0,
+            )
+        elif len(input) == 37:
+            return cls(
+                tenant_id=TenantId(input[0:32]),
+                shard_number=int(input[33:35], 16),
+                shard_count=int(input[35:37], 16),
+            )
+        else:
+            raise ValueError(f"Invalid TenantShardId '{input}'")
+
+    def __str__(self):
+        return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
+
+    def _tuple(self) -> tuple[TenantId, int, int]:
+        return (self.tenant_id, self.shard_number, self.shard_count)
+
+    def __lt__(self, other) -> bool:
+        if not isinstance(other, type(self)):
+            return NotImplemented
+        return self._tuple() < other._tuple()
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, type(self)):
+            return NotImplemented
+        return self._tuple() == other._tuple()
+
+    def __hash__(self) -> int:
+        return hash(self._tuple())
diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index a2a1fa11e5..edc23b29ba 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -61,7 +61,6 @@ def measure_recovery_time(env: NeonCompare):
     # of view, but the same as far as the safekeeper/WAL is concerned.  To work around that,
     # we will explicitly create the tenant in the same generation that it was previously
     # attached in.
-    assert env.env.attachment_service is not None
     attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant)
     assert attach_status is not None
     (attach_gen, _) = attach_status
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 352ec13884..32397bbcc1 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -136,10 +136,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
     ps_http.tenant_detach(tenant_id)
     assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]
 
-    body = {}
-    gen = env.pageserver.maybe_get_generation(tenant_id)
-    if gen is not None:
-        body["generation"] = gen
+    body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)}
 
     ps_http.post(
         f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py
index 1b6c982850..adb67a579e 100644
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -87,7 +87,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     #
     # Since we're dual-attached, need to tip-off attachment service to treat the one we're
     # about to start as the attached pageserver
-    assert env.attachment_service is not None
     env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
     env.pageservers[0].start()
     env.pageservers[1].stop()
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index ef2b2185c3..340188c1ae 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -157,7 +157,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     time.sleep(1.1)  # so that we can use change in pre_stat.st_mtime to detect overwrites
 
     def get_generation_number():
-        assert env.attachment_service is not None
         attachment = env.attachment_service.inspect(tenant_id)
         assert attachment is not None
         return attachment[0]
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 64e41a2dd5..573d2139ce 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -72,7 +72,9 @@ def check_client(env: NeonEnv, client: PageserverHttpClient):
 
     # create new tenant and check it is also there
     tenant_id = TenantId.generate()
-    client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id))
+    client.tenant_create(
+        tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
+    )
     assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}
 
     timelines = client.timeline_list(tenant_id)
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 4488be31c5..9c2f5786d4 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -187,7 +187,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     - After upgrade, the bucket should contain a mixture.
     - In both cases, postgres I/O should work.
     """
-    neon_env_builder.enable_generations = True
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
@@ -196,7 +195,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     env.broker.try_start()
     for sk in env.safekeepers:
         sk.start()
-    assert env.attachment_service is not None
     env.attachment_service.start()
 
     env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
@@ -262,12 +260,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
 
 def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_generations = True
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    assert env.attachment_service is not None
 
     some_other_pageserver = 1234
     ps_http = env.pageserver.http_client()
@@ -341,7 +337,6 @@ def test_deletion_queue_recovery(
     :param validate_before: whether to wait for deletions to be validated before restart.  This
     makes them elegible to be executed after restart, if the same node keeps the attachment.
     """
-    neon_env_builder.enable_generations = True
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
@@ -405,7 +400,6 @@ def test_deletion_queue_recovery(
 
     if keep_attachment == KeepAttachment.LOSE:
         some_other_pageserver = 101010
-        assert env.attachment_service is not None
         env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
 
     env.pageserver.start()
@@ -453,7 +447,6 @@ def test_deletion_queue_recovery(
 
 
 def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    neon_env_builder.enable_generations = True
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
@@ -473,7 +466,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     )
 
     # Simulate a major incident: the control plane goes offline
-    assert env.attachment_service is not None
     env.attachment_service.stop()
 
     # Remember how many validations had happened before the control plane went offline
@@ -545,7 +537,6 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
     and must be constructed using the proper generation for the layer, which may not be the same generation
     that the tenant is running in.
     """
-    neon_env_builder.enable_generations = True
     neon_env_builder.enable_pageserver_remote_storage(
         RemoteStorageKind.MOCK_S3,
     )
@@ -575,7 +566,6 @@ def test_multi_attach(
     neon_env_builder: NeonEnvBuilder,
     pg_bin: PgBin,
 ):
-    neon_env_builder.enable_generations = True
     neon_env_builder.num_pageservers = 3
     neon_env_builder.enable_pageserver_remote_storage(
         remote_storage_kind=RemoteStorageKind.MOCK_S3,
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index 3cac32b790..c4499196b5 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -9,9 +9,7 @@ from fixtures.utils import wait_until
 
 # Test restarting page server, while safekeeper and compute node keep
 # running.
-@pytest.mark.parametrize("generations", [True, False])
-def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool):
-    neon_env_builder.enable_generations = generations
+def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.enable_scrub_on_exit()
 
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 64ade346aa..8ae4297983 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -57,13 +57,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
     states are valid, so that we may test it in this way: the API should always
     work as long as the tenant exists.
     """
-    neon_env_builder.enable_generations = True
     neon_env_builder.num_pageservers = 3
     neon_env_builder.enable_pageserver_remote_storage(
         remote_storage_kind=RemoteStorageKind.MOCK_S3,
     )
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    assert env.attachment_service is not None
 
     pageservers = env.pageservers
     list([p.http_client() for p in pageservers])
@@ -210,13 +208,11 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
     """
     Test the sequence of location states that are used in a live migration.
     """
-    neon_env_builder.enable_generations = True
     neon_env_builder.num_pageservers = 2
     neon_env_builder.enable_pageserver_remote_storage(
         remote_storage_kind=RemoteStorageKind.MOCK_S3,
     )
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    assert env.attachment_service is not None
 
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 3004d69f50..2fda56d0f4 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -60,8 +60,6 @@ def test_remote_storage_backup_and_restore(
 
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
-    neon_env_builder.enable_generations = generations
-
     # Exercise retry code path by making all uploads and downloads fail for the
     # first time. The retries print INFO-messages to the log; we will check
     # that they are present after the test.

From 0f56104a6120876c387fcecb10b8f76dcef77504 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Wed, 20 Dec 2023 19:06:55 +0400
Subject: [PATCH 145/202] Make sk_collect_dumps also possible with teleport
 (#4739)

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
---
 scripts/sk_collect_dumps/.gitignore     |  2 +
 scripts/sk_collect_dumps/ansible.cfg    | 11 ++++++
 scripts/sk_collect_dumps/pyproject.toml | 16 ++++++++
 scripts/sk_collect_dumps/readme.md      | 50 +++++++++++++++++--------
 scripts/sk_collect_dumps/remote.yaml    | 33 ++++++++++++----
 scripts/sk_collect_dumps/ssh.cfg        | 13 +++++++
 scripts/sk_collect_dumps/upload.sh      | 26 ++++++-------
 7 files changed, 115 insertions(+), 36 deletions(-)
 create mode 100644 scripts/sk_collect_dumps/ansible.cfg
 create mode 100644 scripts/sk_collect_dumps/pyproject.toml
 create mode 100644 scripts/sk_collect_dumps/ssh.cfg

diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore
index d9d4d0296a..cdf99aefd7 100644
--- a/scripts/sk_collect_dumps/.gitignore
+++ b/scripts/sk_collect_dumps/.gitignore
@@ -1,2 +1,4 @@
 result
 *.json
+hosts
+poetry.lock
diff --git a/scripts/sk_collect_dumps/ansible.cfg b/scripts/sk_collect_dumps/ansible.cfg
new file mode 100644
index 0000000000..150986ab79
--- /dev/null
+++ b/scripts/sk_collect_dumps/ansible.cfg
@@ -0,0 +1,11 @@
+[defaults]
+host_key_checking = False
+inventory=./hosts
+remote_tmp=/tmp
+remote_user=developer
+callbacks_enabled = profile_tasks
+
+[ssh_connection]
+scp_if_ssh = True
+ssh_args = -F ./ssh.cfg
+pipelining = True
diff --git a/scripts/sk_collect_dumps/pyproject.toml b/scripts/sk_collect_dumps/pyproject.toml
new file mode 100644
index 0000000000..c6f6adafe2
--- /dev/null
+++ b/scripts/sk_collect_dumps/pyproject.toml
@@ -0,0 +1,16 @@
+[tool.poetry]
+name = "sk-collect-dumps"
+version = "0.1.0"
+description = ""
+authors = ["Arseny Sher <sher-ars@yandex.ru>"]
+readme = "README.md"
+packages = [{include = "sk_collect_dumps"}]
+
+[tool.poetry.dependencies]
+python = "^3.11"
+ansible = "^9.1.0"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md
index 52b73e9495..7494a6cb78 100644
--- a/scripts/sk_collect_dumps/readme.md
+++ b/scripts/sk_collect_dumps/readme.md
@@ -1,25 +1,43 @@
 # Collect /v1/debug_dump from all safekeeper nodes
 
-1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
-2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
-
-## How to use ansible (staging)
-
+1. Issue admin token (add/remove .stage from the URL for staging/prod and set the proper API key):
 ```
-AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+# staging:
+AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
+# prod:
+AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
+# check
+echo $AUTH_TOKEN
+```
+2. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
 
-AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+There are two ways to do that, with ssm or tsh. ssm:
+```
+# in aws repo, cd .github/ansible and run e.g. (adjusting profile and region in vars and limit):
+AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml
+```
+It will put the results into the `result` directory *near the playbook*.
+
+tsh:
+
+Update the inventory, if needed, selecting .build/.tech and optionally region:
+```
+rm -f hosts && echo '[safekeeper]' >> hosts
+# staging:
+tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts
+# prod:
+tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts
 ```
 
-## How to use ansible (prod)
-
+Test ansible connection:
 ```
-AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
-
-AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
-
-AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
-
-AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+ansible all -m ping -v
 ```
 
+Download the dumps:
+```
+mkdir -p result && rm -f result/*
+ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml
+```
+
+3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml
index 29ce83efde..f214d0ae2c 100644
--- a/scripts/sk_collect_dumps/remote.yaml
+++ b/scripts/sk_collect_dumps/remote.yaml
@@ -1,18 +1,37 @@
 - name: Fetch state dumps from safekeepers
-  hosts: safekeepers
+  hosts: safekeeper
   gather_facts: False
-  remote_user: "{{ remote_user }}"
     
   tasks:
-    - name: Download file
+    - name: Dump file
       get_url:
         url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false"
-        dest: "/tmp/{{ inventory_hostname }}.json"
+        dest: "/tmp/{{ inventory_hostname }}-dump.json"
+        headers:
+          Authorization: "Bearer {{ auth_token }}"
 
-    - name: Fetch file from remote hosts
+    - name: install rsync
+      ansible.builtin.apt:
+        name: rsync
+        update_cache: yes
+      become: yes
+      ignore_errors: true # it can be already installed and we don't always have sudo
+
+    - name: Fetch file from remote hosts (works only with ssm)
       fetch:
-        src: "/tmp/{{ inventory_hostname }}.json"
-        dest: "./result/{{ inventory_hostname }}.json"
+        src: "/tmp/{{ inventory_hostname }}-dump.json"
+        dest: "./result/{{ inventory_hostname }}-dump.json"
         flat: yes
         fail_on_missing: no
+      when: ansible_connection == "aws_ssm"
 
+    # xxx not sure how to make ansible 'synchronize' work with tsh
+    - name: Fetch file from remote hosts
+      shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json"  "./result/{{ inventory_hostname }}-dump.json"
+      delegate_to: localhost
+      when: ansible_connection != "aws_ssm"
+
+    - name: remove remote dumps
+      ansible.builtin.file:
+        path: "/tmp/{{ inventory_hostname }}-dump.json"
+        state: absent
diff --git a/scripts/sk_collect_dumps/ssh.cfg b/scripts/sk_collect_dumps/ssh.cfg
new file mode 100644
index 0000000000..827c5d9286
--- /dev/null
+++ b/scripts/sk_collect_dumps/ssh.cfg
@@ -0,0 +1,13 @@
+# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh
+
+# Common flags for all teleport.aws.neon.tech hosts
+Host *
+    HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com
+
+# Flags for all teleport.aws.neon.tech hosts except the proxy
+Host * !teleport.aws.neon.tech
+    Port 3022
+    ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p
+    User developer
+
+# End generated Teleport configuration
\ No newline at end of file
diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh
index 2e54ecba1c..5189883fcb 100755
--- a/scripts/sk_collect_dumps/upload.sh
+++ b/scripts/sk_collect_dumps/upload.sh
@@ -31,22 +31,22 @@ SELECT
   (data->>'tenant_id') AS tenant_id,
   (data->>'timeline_id') AS timeline_id,
   (data->'memory'->>'active')::bool AS active,
-  (data->'memory'->>'flush_lsn')::bigint AS flush_lsn,
-  (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn,
-  (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn,
-  (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn,
-  (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn,
-  (data->'memory'->>'write_lsn')::bigint AS write_lsn,
+  (data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn,
+  (data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn,
+  (data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn,
+  (data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn,
+  (data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn,
+  (data->'memory'->>'write_lsn')::pg_lsn AS write_lsn,
   (data->'memory'->>'num_computes')::bigint AS num_computes,
-  (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn,
+  (data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn,
   (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno,
   (data->'memory'->>'is_cancelled')::bool AS is_cancelled,
-  (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn,
-  (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn,
+  (data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn,
+  (data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn,
   (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term,
-  (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn,
-  (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn,
-  (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn,
-  (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn
+  (data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn,
+  (data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn,
+  (data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn,
+  (data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn
 FROM tmp_json
 EOF

From ac38d3a88c933f11860f770a39ab984905e01b32 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Dec 2023 17:00:29 +0000
Subject: [PATCH 146/202] remote_storage: don't count 404s as errors (#6201)

## Problem

Currently a chart of S3 error rate is misleading: it can show errors any
time we are attaching a tenant (probing for index_part generation,
checking for remote delete marker).

Considering 404 successful isn't perfectly elegant, but it enables the
error rate to be used as a more meaningful alert signal: it would
indicate if we were having auth issues, sending bad requests, getting
throttled, etc.

## Summary of changes

Track 404 requests in the AttemptOutcome::Ok bucket instead of the
AttemptOutcome::Err bucket.
---
 libs/remote_storage/src/s3_bucket.rs        | 30 +++++++++++++--------
 test_runner/regress/test_timeline_delete.py |  9 -------
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 97fa1bbf5b..d63a5ed99b 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -218,14 +218,6 @@ impl S3Bucket {
 
         let started_at = ScopeGuard::into_inner(started_at);
 
-        if get_object.is_err() {
-            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                kind,
-                AttemptOutcome::Err,
-                started_at,
-            );
-        }
-
         match get_object {
             Ok(object_output) => {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
@@ -241,11 +233,27 @@ impl S3Bucket {
                 })
             }
             Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
+                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
+                // an error: we expect to sometimes fetch an object and find it missing,
+                // e.g. when probing for timeline indices.
+                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                 Err(DownloadError::NotFound)
             }
-            Err(e) => Err(DownloadError::Other(
-                anyhow::Error::new(e).context("download s3 object"),
-            )),
+            Err(e) => {
+                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Err,
+                    started_at,
+                );
+
+                Err(DownloadError::Other(
+                    anyhow::Error::new(e).context("download s3 object"),
+                ))
+            }
         }
     }
 }
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index c6d578a7a2..82ffcb1177 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -263,15 +263,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
                 ps_http, env.initial_tenant, timeline_id, iterations=iterations
             )
 
-            if failpoint == "timeline-delete-after-index-delete":
-                m = ps_http.get_metrics()
-                assert (
-                    m.query_one(
-                        "remote_storage_s3_request_seconds_count",
-                        filter={"request_type": "get_object", "result": "ok"},
-                    ).value
-                    == 1  # index part for initial timeline
-                )
     elif check is Check.RETRY_WITHOUT_RESTART:
         # this should succeed
         # this also checks that delete can be retried even when timeline is in Broken state

From 48f156b8a2e1ea69823c355cec4cce86f25676ff Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 20 Dec 2023 20:44:19 +0200
Subject: [PATCH 147/202] feat: relative last activity based eviction (#6136)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a new disk usage based eviction option, EvictionOrder, which
selects whether to use the current `AbsoluteAccessed` or the newly
proposed but not yet tested `RelativeAccessed`. Additionally, a fudge
factor was noticed while implementing this, which might help spare
smaller tenants at the expense of targeting larger tenants.

Cc: #5304

Co-authored-by: Arpad Müller <arpad@neon.tech>
---
 pageserver/src/config.rs                      |   1 +
 pageserver/src/disk_usage_eviction_task.rs    | 286 ++++++++++++++++--
 pageserver/src/http/routes.rs                 |  15 +-
 .../regress/test_disk_usage_eviction.py       | 116 +++++--
 4 files changed, 363 insertions(+), 55 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index bd63c4d860..8516f397ca 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -1468,6 +1468,7 @@ threshold = "20m"
                 period: Duration::from_secs(10),
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
+                eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
             })
         );
         match &conf.default_tenant_conf.eviction_policy {
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 76906cfaf7..23b9b573b6 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -74,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig {
     pub period: Duration,
     #[cfg(feature = "testing")]
     pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
+    /// Select sorting for evicted layers
+    #[serde(default)]
+    pub eviction_order: EvictionOrder,
+}
+
+/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
+/// partitioning.
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "type", content = "args")]
+pub enum EvictionOrder {
+    /// Order the layers to be evicted by how recently they have been accessed in absolute
+    /// time.
+    ///
+    /// This strategy is unfair towards slower growing tenants when some tenants grow faster
+    /// than others.
+    #[default]
+    AbsoluteAccessed,
+
+    /// Order the layers to be evicted by how recently they have been accessed relatively within
+    /// the set of resident layers of a tenant.
+    ///
+    /// This strategy will evict layers more fairly but is untested.
+    RelativeAccessed {
+        #[serde(default)]
+        highest_layer_count_loses_first: bool,
+    },
+}
+
+impl EvictionOrder {
+    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
+    /// counts should be the first ones to have their layers evicted.
+    fn highest_layer_count_loses_first(&self) -> bool {
+        match self {
+            EvictionOrder::AbsoluteAccessed => false,
+            EvictionOrder::RelativeAccessed {
+                highest_layer_count_loses_first,
+            } => *highest_layer_count_loses_first,
+        }
+    }
 }
 
 #[derive(Default)]
@@ -192,7 +231,14 @@ async fn disk_usage_eviction_task_iteration(
 ) -> anyhow::Result<()> {
     let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
         .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(
+        state,
+        storage,
+        usage_pre,
+        task_config.eviction_order,
+        cancel,
+    )
+    .await;
     match res {
         Ok(outcome) => {
             debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -278,6 +324,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     state: &State,
     _storage: &GenericRemoteStorage,
     usage_pre: U,
+    eviction_order: EvictionOrder,
     cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
     // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -297,7 +344,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
         "running disk usage based eviction due to pressure"
     );
 
-    let candidates = match collect_eviction_candidates(cancel).await? {
+    let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
         EvictionCandidates::Cancelled => {
             return Ok(IterationOutcome::Cancelled);
         }
@@ -307,16 +354,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     // Debug-log the list of candidates
     let now = SystemTime::now();
     for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        let nth = i + 1;
         let desc = candidate.layer.layer_desc();
+        let total_candidates = candidates.len();
+        let size = desc.file_size;
+        let rel = candidate.relative_last_activity;
         debug!(
-            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
-            i + 1,
-            candidates.len(),
-            desc.file_size,
+            "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
             now.duration_since(candidate.last_activity_ts)
                 .unwrap()
                 .as_micros(),
-            partition,
             desc.tenant_shard_id,
             desc.timeline_id,
             candidate.layer,
@@ -459,6 +506,7 @@ struct EvictionCandidate {
     timeline: Arc<Timeline>,
     layer: Layer,
     last_activity_ts: SystemTime,
+    relative_last_activity: finite_f32::FiniteF32,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -478,24 +526,24 @@ enum EvictionCandidates {
 /// order. A caller that evicts in that order, until pressure is relieved, implements
 /// the eviction policy outlined in the module comment.
 ///
-/// # Example
+/// # Example with EvictionOrder::AbsoluteAccessed
 ///
 /// Imagine that there are two tenants, A and B, with five layers each, a-e.
 /// Each layer has size 100, and both tenant's min_resident_size is 150.
 /// The eviction order would be
 ///
 /// ```text
-/// partition last_activity_ts    tenant/layer
-/// Above     18:30               A/c
-/// Above     19:00               A/b
-/// Above     18:29               B/c
-/// Above     19:05               B/b
-/// Above     20:00               B/a
-/// Above     20:03               A/a
-/// Below     20:30               A/d
-/// Below     20:40               B/d
-/// Below     20:45               B/e
-/// Below     20:58               A/e
+/// partition last_activity_ts tenant/layer
+/// Above     18:30            A/c
+/// Above     19:00            A/b
+/// Above     18:29            B/c
+/// Above     19:05            B/b
+/// Above     20:00            B/a
+/// Above     20:03            A/a
+/// Below     20:30            A/d
+/// Below     20:40            B/d
+/// Below     20:45            B/e
+/// Below     20:58            A/e
 /// ```
 ///
 /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
@@ -505,7 +553,77 @@ enum EvictionCandidates {
 /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
 /// after exhauting the `Above` partition.
 /// So, we did not respect each tenant's min_resident_size.
+///
+/// # Example with EvictionOrder::RelativeAccessed
+///
+/// ```text
+/// partition relative_age last_activity_ts tenant/layer
+/// Above     0/4          18:30            A/c
+/// Above     0/4          18:29            B/c
+/// Above     1/4          19:00            A/b
+/// Above     1/4          19:05            B/b
+/// Above     2/4          20:00            B/a
+/// Above     2/4          20:03            A/a
+/// Below     3/4          20:30            A/d
+/// Below     3/4          20:40            B/d
+/// Below     4/4          20:45            B/e
+/// Below     4/4          20:58            A/e
+/// ```
+///
+/// With tenants having the same number of layers the picture does not change much. The same with
+/// A having many more layers **resident** (not all of them listed):
+///
+/// ```text
+/// Above       0/100      18:30            A/c
+/// Above       0/4        18:29            B/c
+/// Above       1/100      19:00            A/b
+/// Above       2/100      20:03            A/a
+/// Above       3/100      20:03            A/nth_3
+/// Above       4/100      20:03            A/nth_4
+///             ...
+/// Above       1/4        19:05            B/b
+/// Above      25/100      20:04            A/nth_25
+///             ...
+/// Above       2/4        20:00            B/a
+/// Above      50/100      20:10            A/nth_50
+///             ...
+/// Below       3/4        20:40            B/d
+/// Below      99/100      20:30            A/nth_99
+/// Below       4/4        20:45            B/e
+/// Below     100/100      20:58            A/nth_100
+/// ```
+///
+/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
+/// difficult to see is what happens on the next round assuming that evicting 23 layers from the
+/// above list relieves the pressure (22 A layers gone, 1 B layer gone) but a new fast growing
+/// tenant C has appeared:
+///
+/// ```text
+/// Above       0/87       20:04            A/nth_23
+/// Above       0/3        19:05            B/b
+/// Above       0/50       20:59            C/nth_0
+/// Above       1/87       20:04            A/nth_24
+/// Above       1/50       21:00            C/nth_1
+/// Above       2/87       20:04            A/nth_25
+///             ...
+/// Above      16/50       21:02            C/nth_16
+/// Above       1/3        20:00            B/a
+/// Above      27/87       20:10            A/nth_50
+///             ...
+/// Below       2/3        20:40            B/d
+/// Below      49/50       21:05            C/nth_49
+/// Below      86/87       20:30            A/nth_99
+/// Below       3/3        20:45            B/e
+/// Below      50/50       21:05            C/nth_50
+/// Below      87/87       20:58            A/nth_100
+/// ```
+///
+/// Now relieving pressure with 23 layers would cost:
+/// - tenant A 14 layers
+/// - tenant B 1 layer
+/// - tenant C 8 layers
 async fn collect_eviction_candidates(
+    eviction_order: EvictionOrder,
     cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
     // get a snapshot of the list of tenants
@@ -591,12 +709,63 @@ async fn collect_eviction_candidates(
         tenant_candidates
             .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
         let mut cumsum: i128 = 0;
-        for (timeline, layer_info) in tenant_candidates.into_iter() {
+
+        // keeping the -1 or not decides if every tenant should lose their least recently accessed
+        // layer OR if this should happen in the order of having highest layer count:
+        let fudge = if eviction_order.highest_layer_count_loses_first() {
+            // relative_age vs. tenant layer count:
+            // - 0.1..=1.0 (10 layers)
+            // - 0.01..=1.0 (100 layers)
+            // - 0.001..=1.0 (1000 layers)
+            //
+            // leading to evicting less of the smallest tenants.
+            0
+        } else {
+            // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
+            // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
+            // be that less than 10k layer evictions is enough, so we would not need to evict from
+            // all tenants.
+            //
+            // as the tenant ordering is now deterministic this could hit the same tenants
+            // disproportionately on multiple invocations. alternative could be to remember how many
+            // layers did we evict last time from this tenant, and inject that as an additional
+            // fudge here.
+            1
+        };
+
+        let total = tenant_candidates
+            .len()
+            .checked_sub(fudge)
+            .filter(|&x| x > 0)
+            // support 0 or 1 resident layer tenants as well
+            .unwrap_or(1);
+        let divider = total as f32;
+
+        for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
             let file_size = layer_info.file_size();
+
+            // as we iterate this reverse sorted list, the most recently accessed layer will always
+            // be 1.0; this is for us to evict it last.
+            let relative_last_activity = if matches!(
+                eviction_order,
+                EvictionOrder::RelativeAccessed { .. }
+            ) {
+                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
+                // similarly for u16. unsure how it would help.
+                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
+                    .unwrap_or_else(|val| {
+                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
+                        finite_f32::FiniteF32::ZERO
+                    })
+            } else {
+                finite_f32::FiniteF32::ZERO
+            };
+
             let candidate = EvictionCandidate {
                 timeline,
                 last_activity_ts: layer_info.last_activity_ts,
                 layer: layer_info.layer,
+                relative_last_activity,
             };
             let partition = if cumsum > min_resident_size as i128 {
                 MinResidentSizePartition::Above
@@ -610,8 +779,19 @@ async fn collect_eviction_candidates(
 
     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
-    candidates
-        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+
+    match eviction_order {
+        EvictionOrder::AbsoluteAccessed => {
+            candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.last_activity_ts)
+            });
+        }
+        EvictionOrder::RelativeAccessed { .. } => {
+            candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.relative_last_activity)
+            });
+        }
+    }
 
     Ok(EvictionCandidates::Finished(candidates))
 }
@@ -640,6 +820,66 @@ impl std::ops::Deref for TimelineKey {
     }
 }
 
+/// A totally ordered f32 subset we can use with sorting functions.
+mod finite_f32 {
+
+    /// A totally ordered f32 subset we can use with sorting functions.
+    #[derive(Clone, Copy, PartialEq)]
+    pub struct FiniteF32(f32);
+
+    impl std::fmt::Debug for FiniteF32 {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            std::fmt::Debug::fmt(&self.0, f)
+        }
+    }
+
+    impl std::fmt::Display for FiniteF32 {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            std::fmt::Display::fmt(&self.0, f)
+        }
+    }
+
+    impl std::cmp::Eq for FiniteF32 {}
+
+    impl std::cmp::PartialOrd for FiniteF32 {
+        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+            Some(self.cmp(other))
+        }
+    }
+
+    impl std::cmp::Ord for FiniteF32 {
+        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+            self.0.total_cmp(&other.0)
+        }
+    }
+
+    impl TryFrom<f32> for FiniteF32 {
+        type Error = f32;
+
+        fn try_from(value: f32) -> Result<Self, Self::Error> {
+            if value.is_finite() {
+                Ok(FiniteF32(value))
+            } else {
+                Err(value)
+            }
+        }
+    }
+
+    impl FiniteF32 {
+        pub const ZERO: FiniteF32 = FiniteF32(0.0);
+
+        pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
+            if (0.0..=1.0).contains(&value) {
+                // -0.0 is within the range, make sure it is assumed 0.0..=1.0
+                let value = value.abs();
+                Ok(FiniteF32(value))
+            } else {
+                Err(value)
+            }
+        }
+    }
+}
+
 mod filesystem_level_usage {
     use anyhow::Context;
     use camino::Utf8Path;
@@ -721,6 +961,7 @@ mod filesystem_level_usage {
 
     #[test]
     fn max_usage_pct_pressure() {
+        use super::EvictionOrder;
         use super::Usage as _;
         use std::time::Duration;
         use utils::serde_percent::Percent;
@@ -732,6 +973,7 @@ mod filesystem_level_usage {
                 period: Duration::MAX,
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
+                eviction_order: EvictionOrder::default(),
             },
             total_bytes: 100_000,
             avail_bytes: 0,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index e641e44b08..3ea79ea4f2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1566,19 +1566,22 @@ async fn disk_usage_eviction_run(
     struct Config {
         /// How many bytes to evict before reporting that pressure is relieved.
         evict_bytes: u64,
+
+        #[serde(default)]
+        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
     }
 
     #[derive(Debug, Clone, Copy, serde::Serialize)]
     struct Usage {
         // remains unchanged after instantiation of the struct
-        config: Config,
+        evict_bytes: u64,
         // updated by `add_available_bytes`
         freed_bytes: u64,
     }
 
     impl crate::disk_usage_eviction_task::Usage for Usage {
         fn has_pressure(&self) -> bool {
-            self.config.evict_bytes > self.freed_bytes
+            self.evict_bytes > self.freed_bytes
         }
 
         fn add_available_bytes(&mut self, bytes: u64) {
@@ -1589,7 +1592,7 @@ async fn disk_usage_eviction_run(
     let config = json_request::<Config>(&mut r).await?;
 
     let usage = Usage {
-        config,
+        evict_bytes: config.evict_bytes,
         freed_bytes: 0,
     };
 
@@ -1604,7 +1607,11 @@ async fn disk_usage_eviction_run(
     let state = state.disk_usage_eviction_state.clone();
 
     let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-        &state, storage, usage, &cancel,
+        &state,
+        storage,
+        usage,
+        config.eviction_order,
+        &cancel,
     )
     .await;
 
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index f3f3a1ddf3..9fdc4d59f5 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -1,6 +1,7 @@
+import enum
 import time
 from dataclasses import dataclass
-from typing import Dict, Tuple
+from typing import Any, Dict, Tuple
 
 import pytest
 import toml
@@ -64,6 +65,23 @@ def test_min_resident_size_override_handling(
     assert_config(tenant_id, None, config_level_override)
 
 
+@enum.unique
+class EvictionOrder(str, enum.Enum):
+    ABSOLUTE_ORDER = "absolute"
+    RELATIVE_ORDER_EQUAL = "relative_equal"
+    RELATIVE_ORDER_SPARE = "relative_spare"
+
+    def config(self) -> Dict[str, Any]:
+        if self == EvictionOrder.ABSOLUTE_ORDER:
+            return {"type": "AbsoluteAccessed"}
+        elif self == EvictionOrder.RELATIVE_ORDER_EQUAL:
+            return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}}
+        elif self == EvictionOrder.RELATIVE_ORDER_SPARE:
+            return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}}
+        else:
+            raise RuntimeError(f"not implemented: {self}")
+
+
 @dataclass
 class EvictionEnv:
     timelines: list[Tuple[TenantId, TimelineId]]
@@ -108,13 +126,14 @@ class EvictionEnv:
                     _avg = cur.fetchone()
 
     def pageserver_start_with_disk_usage_eviction(
-        self, period, max_usage_pct, min_avail_bytes, mock_behavior
+        self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder
     ):
         disk_usage_config = {
             "period": period,
             "max_usage_pct": max_usage_pct,
             "min_avail_bytes": min_avail_bytes,
             "mock_statvfs": mock_behavior,
+            "eviction_order": eviction_order.config(),
         }
 
         enc = toml.TomlEncoder()
@@ -270,7 +289,13 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
     env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
 
 
-def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv):
+@pytest.mark.parametrize(
+    "order",
+    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
+)
+def test_pageserver_evicts_until_pressure_is_relieved(
+    eviction_env: EvictionEnv, order: EvictionOrder
+):
     """
     Basic test to ensure that we evict enough to relieve pressure.
     """
@@ -281,7 +306,9 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv)
 
     target = total_on_disk // 2
 
-    response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
+    response = pageserver_http.disk_usage_eviction_run(
+        {"evict_bytes": target, "eviction_order": order.config()}
+    )
     log.info(f"{response}")
 
     (later_total_on_disk, _, _) = env.timelines_du()
@@ -296,7 +323,13 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv)
     assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected"
 
 
-def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv):
+@pytest.mark.parametrize(
+    "order",
+    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
+)
+def test_pageserver_respects_overridden_resident_size(
+    eviction_env: EvictionEnv, order: EvictionOrder
+):
     """
     Override tenant min resident and ensure that it will be respected by eviction.
     """
@@ -336,7 +369,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)
     env.warm_up_tenant(large_tenant[0])
 
     # do one run
-    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
+    response = ps_http.disk_usage_eviction_run(
+        {"evict_bytes": target, "eviction_order": order.config()}
+    )
     log.info(f"{response}")
 
     time.sleep(1)  # give log time to flush
@@ -365,7 +400,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)
     assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target
 
 
-def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
+@pytest.mark.parametrize(
+    "order",
+    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
+)
+def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder):
     """
     If we can't relieve pressure using tenant_min_resident_size-respecting eviction,
     we should continue to evict layers following global LRU.
@@ -376,7 +415,9 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
     (total_on_disk, _, _) = env.timelines_du()
     target = total_on_disk
 
-    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
+    response = ps_http.disk_usage_eviction_run(
+        {"evict_bytes": target, "eviction_order": order.config()}
+    )
     log.info(f"{response}")
 
     (later_total_on_disk, _, _) = env.timelines_du()
@@ -389,7 +430,15 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
     env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
 
 
-def test_partial_evict_tenant(eviction_env: EvictionEnv):
+@pytest.mark.parametrize(
+    "order",
+    [
+        EvictionOrder.ABSOLUTE_ORDER,
+        EvictionOrder.RELATIVE_ORDER_EQUAL,
+        EvictionOrder.RELATIVE_ORDER_SPARE,
+    ],
+)
+def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
     """
     Warm up a tenant, then build up pressure to cause in evictions in both.
     We expect
@@ -402,7 +451,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
     (total_on_disk, _, _) = env.timelines_du()
     du_by_timeline = env.du_by_timeline()
 
-    # pick any tenant
+    # pick smaller or greater (iteration order is insertion order of scale=4 and scale=6)
     [warm, cold] = list(du_by_timeline.keys())
     (tenant_id, timeline_id) = warm
 
@@ -413,7 +462,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
     # but not enough to fall into global LRU.
     # So, set target to all occupied space, except 2*env.layer_size per tenant
     target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
-    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
+    response = ps_http.disk_usage_eviction_run(
+        {"evict_bytes": target, "eviction_order": order.config()}
+    )
     log.info(f"{response}")
 
     (later_total_on_disk, _, _) = env.timelines_du()
@@ -428,28 +479,32 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
         ), "all tenants should have lost some layers"
 
     warm_size = later_du_by_timeline[warm]
-
-    # bounds for warmed_size
-    warm_lower = 0.5 * du_by_timeline[warm]
-
-    # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
-    # So, check for up to 3 here.
-    warm_upper = warm_lower + 3 * env.layer_size
-
     cold_size = later_du_by_timeline[cold]
-    cold_upper = 2 * env.layer_size
 
-    log.info(
-        f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
-    )
-    log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+    if order == EvictionOrder.ABSOLUTE_ORDER:
+        # bounds for warmed_size
+        warm_lower = 0.5 * du_by_timeline[warm]
 
-    assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
-    assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+        # So, check for up to 3 here.
+        warm_upper = warm_lower + 3 * env.layer_size
 
-    assert (
-        cold_size < cold_upper
-    ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
+        cold_upper = 2 * env.layer_size
+        log.info(f"tenants: warm={warm[0]}, cold={cold[0]}")
+        log.info(
+            f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
+        )
+        log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+
+        assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
+        assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+
+        assert (
+            cold_size < cold_upper
+        ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
+    else:
+        # just go with the space was freed, find proper limits later
+        pass
 
 
 def poor_mans_du(
@@ -501,6 +556,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
             "type": "Failure",
             "mocked_error": "EIO",
         },
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
     assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
@@ -533,6 +589,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
             # This avoids accounting for metadata files & tenant conf in the tests.
             "name_filter": ".*__.*",
         },
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
     def relieved_log_message():
@@ -573,6 +630,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
             # This avoids accounting for metadata files & tenant conf in the tests.
             "name_filter": ".*__.*",
         },
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
     def relieved_log_message():

From baa1323b4a1d4d38f67101822e1cf20dc38f7ce9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 20 Dec 2023 23:38:58 +0100
Subject: [PATCH 148/202] Use ProfileFileCredentialsProvider for AWS SDK
 configuration (#6202)

Allows usage via `aws sso login --profile=<p>; AWS_PROFILE=<p>`. Now
there is no need to manually configure things any more via
`SSO_ACCOUNT_ID` and others. Now one can run the tests locally (given
Neon employee access to AWS):

```
aws sso login --profile dev
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty REMOTE_STORAGE_S3_REGION=eu-central-1 REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev AWS_PROFILE=dev
cargo test -p remote_storage -j 1 s3 -- --nocapture
```

Also makes the scrubber use the same region for auth that it does its
operations in (not touching the hard coded role name and start_url
values here, they are not ideal though).
---
 libs/remote_storage/src/s3_bucket.rs | 18 ++++++++++++++----
 s3_scrubber/src/lib.rs               |  8 +++++++-
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index d63a5ed99b..98be6f0637 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -16,6 +16,7 @@ use aws_config::{
     environment::credentials::EnvironmentVariableCredentialsProvider,
     imds::credentials::ImdsCredentialsProvider,
     meta::credentials::CredentialsProviderChain,
+    profile::ProfileFileCredentialsProvider,
     provider_config::ProviderConfig,
     retry::{RetryConfigBuilder, RetryMode},
     web_identity_token::WebIdentityTokenCredentialsProvider,
@@ -74,20 +75,29 @@ impl S3Bucket {
 
         let region = Some(Region::new(aws_config.bucket_region.clone()));
 
+        let provider_conf = ProviderConfig::without_region().with_region(region.clone());
+
         let credentials_provider = {
             // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
             CredentialsProviderChain::first_try(
                 "env",
                 EnvironmentVariableCredentialsProvider::new(),
             )
+            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
+            .or_else(
+                "profile-sso",
+                ProfileFileCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
             // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
             // needed to access remote extensions bucket
-            .or_else("token", {
-                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
+            .or_else(
+                "token",
                 WebIdentityTokenCredentialsProvider::builder()
                     .configure(&provider_conf)
-                    .build()
-            })
+                    .build(),
+            )
             // uses imds v2
             .or_else("imds", ImdsCredentialsProvider::builder().build())
         };
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index d2338c21e5..8fb1346c8e 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -15,6 +15,7 @@ use anyhow::Context;
 use aws_config::environment::EnvironmentVariableCredentialsProvider;
 use aws_config::imds::credentials::ImdsCredentialsProvider;
 use aws_config::meta::credentials::CredentialsProviderChain;
+use aws_config::profile::ProfileFileCredentialsProvider;
 use aws_config::sso::SsoCredentialsProvider;
 use aws_config::BehaviorVersion;
 use aws_sdk_s3::config::Region;
@@ -255,6 +256,11 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
         let chain = CredentialsProviderChain::first_try(
             "env",
             EnvironmentVariableCredentialsProvider::new(),
+        )
+        // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
+        .or_else(
+            "profile-sso",
+            ProfileFileCredentialsProvider::builder().build(),
         );
 
         // Use SSO if we were given an account ID
@@ -265,7 +271,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
                     .account_id(sso_account)
                     .role_name("PowerUserAccess")
                     .start_url("https://neondb.awsapps.com/start")
-                    .region(Region::from_static("eu-central-1"))
+                    .region(bucket_region.clone())
                     .build(),
             ),
             None => chain,

From 48890d206e7f3fca54a06f5ab08955a0e2d512f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 21 Dec 2023 12:52:38 +0100
Subject: [PATCH 149/202] Simplify inject_index_part test function (#6207)

Instead of manually constructing the directory's path, we can just use
the `parent()` function.

This is a drive-by improvement from #6206
---
 pageserver/src/tenant/remote_timeline_client.rs | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 52ee8f49ce..1b0cf39fbe 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2192,15 +2192,6 @@ mod tests {
 
         let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
 
-        let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
-        let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
-            timeline_path
-                .strip_prefix(&test_state.harness.conf.workdir)
-                .unwrap(),
-        );
-
-        std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
-
         let index_path = test_state.harness.remote_fs_dir.join(
             remote_index_path(
                 &test_state.harness.tenant_shard_id,
@@ -2209,6 +2200,10 @@ mod tests {
             )
             .get_path(),
         );
+
+        std::fs::create_dir_all(index_path.parent().unwrap())
+            .expect("creating test dir should work");
+
         eprintln!("Writing {index_path}");
         std::fs::write(&index_path, index_part_bytes).unwrap();
         example_index_part

From 2df3602a4b3fa87fafb589974aa376719171d910 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 21 Dec 2023 12:00:10 +0000
Subject: [PATCH 150/202] Add GC to http connection pool (#6196)

## Problem

HTTP connection pool will grow without being pruned

## Summary of changes

Remove connection clients from pools once idle, or once they exit.
Periodically clear pool shards.

GC Logic:

Each shard contains a hashmap of `Arc<EndpointPool>`s.
Each connection stores a `Weak<EndpointPool>`.

During a GC sweep, we take a random shard write lock, and check whether
any of the `Arc<EndpointPool>`s are unique (using `Arc::get_mut`).
- If they are unique, then we check that the endpoint-pool is empty, and
sweep if it is.
- If they are not unique, then the endpoint-pool is in active use and we
don't sweep.
- Idle connections will self-clear from the endpoint-pool after 5
minutes.

Technically, the uniqueness of the endpoint-pool should be enough to
consider it empty, but the connection count check is done for
completeness' sake.
---
 proxy/src/bin/proxy.rs                |  49 +++-
 proxy/src/config.rs                   |   6 +-
 proxy/src/serverless.rs               |   9 +
 proxy/src/serverless/conn_pool.rs     | 366 ++++++++++++++++++--------
 proxy/src/serverless/sql_over_http.rs |   9 +-
 5 files changed, 321 insertions(+), 118 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index be3989d387..5bc2d377a6 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -11,6 +11,7 @@ use proxy::http;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
+use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;
 
 use anyhow::bail;
@@ -95,12 +96,8 @@ struct ProxyCliArgs {
     /// Allow self-signed certificates for compute nodes (for testing)
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     allow_self_signed_compute: bool,
-    /// timeout for http connections
-    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
-    sql_over_http_timeout: tokio::time::Duration,
-    /// Whether the SQL over http pool is opt-in
-    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    sql_over_http_pool_opt_in: bool,
+    #[clap(flatten)]
+    sql_over_http: SqlOverHttpArgs,
     /// timeout for scram authentication protocol
     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     scram_protocol_timeout: tokio::time::Duration,
@@ -138,6 +135,36 @@ struct ProxyCliArgs {
     disable_ip_check_for_http: bool,
 }
 
+#[derive(clap::Args, Clone, Copy, Debug)]
+struct SqlOverHttpArgs {
+    /// timeout for http connection requests
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    sql_over_http_timeout: tokio::time::Duration,
+
+    /// Whether the SQL over http pool is opt-in
+    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    sql_over_http_pool_opt_in: bool,
+
+    /// How many connections to pool for each endpoint. Excess connections are discarded
+    #[clap(long, default_value_t = 20)]
+    sql_over_http_pool_max_conns_per_endpoint: usize,
+
+    /// How long pooled connections should remain idle for before closing
+    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
+    sql_over_http_idle_timeout: tokio::time::Duration,
+
+    /// Duration each shard will wait on average before a GC sweep.
+    /// A longer time will causes sweeps to take longer but will interfere less frequently.
+    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
+    sql_over_http_pool_gc_epoch: tokio::time::Duration,
+
+    /// How many shards should the global pool have. Must be a power of two.
+    /// More shards will introduce less contention for pool operations, but can
+    /// increase memory used by the pool
+    #[clap(long, default_value_t = 128)]
+    sql_over_http_pool_shards: usize,
+}
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     let _logging_guard = proxy::logging::init().await?;
@@ -327,8 +354,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         }
     };
     let http_config = HttpConfig {
-        timeout: args.sql_over_http_timeout,
-        pool_opt_in: args.sql_over_http_pool_opt_in,
+        request_timeout: args.sql_over_http.sql_over_http_timeout,
+        pool_options: GlobalConnPoolOptions {
+            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
+            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
+            pool_shards: args.sql_over_http.sql_over_http_pool_shards,
+            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
+            opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
+        },
     };
     let authentication_config = AuthenticationConfig {
         scram_protocol_timeout: args.scram_protocol_timeout,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 2ed248af8d..610bf7e424 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,4 +1,4 @@
-use crate::{auth, rate_limiter::RateBucketInfo};
+use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
 use anyhow::{bail, ensure, Context, Ok};
 use rustls::{sign, Certificate, PrivateKey};
 use sha2::{Digest, Sha256};
@@ -36,8 +36,8 @@ pub struct TlsConfig {
 }
 
 pub struct HttpConfig {
-    pub timeout: tokio::time::Duration,
-    pub pool_opt_in: bool,
+    pub request_timeout: tokio::time::Duration,
+    pub pool_options: GlobalConnPoolOptions,
 }
 
 pub struct AuthenticationConfig {
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index e358a0712f..07825da8dc 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -6,9 +6,13 @@ mod conn_pool;
 mod sql_over_http;
 mod websocket;
 
+pub use conn_pool::GlobalConnPoolOptions;
+
 use anyhow::bail;
 use hyper::StatusCode;
 use metrics::IntCounterPairGuard;
+use rand::rngs::StdRng;
+use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
@@ -47,6 +51,11 @@ pub async fn task_main(
 
     let conn_pool = conn_pool::GlobalConnPool::new(config);
 
+    let conn_pool2 = Arc::clone(&conn_pool);
+    tokio::spawn(async move {
+        conn_pool2.gc_worker(StdRng::from_entropy()).await;
+    });
+
     // shutdown the connection pool
     tokio::spawn({
         let cancellation_token = cancellation_token.clone();
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index df2d1bea32..c476560215 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -1,15 +1,19 @@
 use anyhow::{anyhow, Context};
 use async_trait::async_trait;
 use dashmap::DashMap;
-use futures::future::poll_fn;
+use futures::{future::poll_fn, Future};
+use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard};
+use once_cell::sync::Lazy;
 use parking_lot::RwLock;
 use pbkdf2::{
     password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
     Params, Pbkdf2,
 };
 use pq_proto::StartupMessageParams;
+use prometheus::{exponential_buckets, register_histogram, Histogram};
+use rand::Rng;
 use smol_str::SmolStr;
-use std::{collections::HashMap, net::IpAddr, sync::Arc};
+use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration};
 use std::{
     fmt,
     task::{ready, Poll},
@@ -18,7 +22,7 @@ use std::{
     ops::Deref,
     sync::atomic::{self, AtomicUsize},
 };
-use tokio::time;
+use tokio::time::{self, Instant};
 use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
 
 use crate::{
@@ -30,11 +34,10 @@ use crate::{
 };
 use crate::{compute, config};
 
-use tracing::{error, warn, Span};
+use tracing::{debug, error, warn, Span};
 use tracing::{info, info_span, Instrument};
 
 pub const APP_NAME: &str = "/sql_over_http";
-const MAX_CONNS_PER_ENDPOINT: usize = 20;
 
 #[derive(Debug, Clone)]
 pub struct ConnInfo {
@@ -69,6 +72,77 @@ struct ConnPoolEntry {
 pub struct EndpointConnPool {
     pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
     total_conns: usize,
+    max_conns: usize,
+    _guard: IntCounterPairGuard,
+}
+
+impl EndpointConnPool {
+    fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
+        let Self {
+            pools, total_conns, ..
+        } = self;
+        pools
+            .get_mut(&db_user)
+            .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
+    }
+
+    fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
+        let Self {
+            pools, total_conns, ..
+        } = self;
+        if let Some(pool) = pools.get_mut(&db_user) {
+            let old_len = pool.conns.len();
+            pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
+            let new_len = pool.conns.len();
+            let removed = old_len - new_len;
+            *total_conns -= removed;
+            removed > 0
+        } else {
+            false
+        }
+    }
+
+    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
+        let conn_id = client.conn_id;
+
+        if client.inner.is_closed() {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
+            return Ok(());
+        }
+
+        // return connection to the pool
+        let mut returned = false;
+        let mut per_db_size = 0;
+        let total_conns = {
+            let mut pool = pool.write();
+
+            if pool.total_conns < pool.max_conns {
+                // we create this db-user entry in get, so it should not be None
+                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                    pool_entries.conns.push(ConnPoolEntry {
+                        conn: client,
+                        _last_access: std::time::Instant::now(),
+                    });
+
+                    returned = true;
+                    per_db_size = pool_entries.conns.len();
+
+                    pool.total_conns += 1;
+                }
+            }
+
+            pool.total_conns
+        };
+
+        // do logging outside of the mutex
+        if returned {
+            info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+        } else {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
+        }
+
+        Ok(())
+    }
 }
 
 /// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
@@ -87,6 +161,27 @@ pub struct DbUserConnPool {
     password_hash: Option<PasswordHashString>,
 }
 
+impl DbUserConnPool {
+    fn clear_closed_clients(&mut self, conns: &mut usize) {
+        let old_len = self.conns.len();
+
+        self.conns.retain(|conn| !conn.conn.inner.is_closed());
+
+        let new_len = self.conns.len();
+        let removed = old_len - new_len;
+        *conns -= removed;
+    }
+
+    fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry> {
+        self.clear_closed_clients(conns);
+        let conn = self.conns.pop();
+        if conn.is_some() {
+            *conns -= 1;
+        }
+        conn
+    }
+}
+
 pub struct GlobalConnPool {
     // endpoint -> per-endpoint connection pool
     //
@@ -94,52 +189,127 @@ pub struct GlobalConnPool {
     // pool as early as possible and release the lock.
     global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
 
+    /// Number of endpoint-connection pools
+    ///
     /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
     /// That seems like far too much effort, so we're using a relaxed increment counter instead.
     /// It's only used for diagnostics.
     global_pool_size: AtomicUsize,
 
+    proxy_config: &'static crate::config::ProxyConfig,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct GlobalConnPoolOptions {
     // Maximum number of connections per one endpoint.
     // Can mix different (dbname, username) connections.
     // When running out of free slots for a particular endpoint,
     // falls back to opening a new connection for each request.
-    max_conns_per_endpoint: usize,
+    pub max_conns_per_endpoint: usize,
 
-    proxy_config: &'static crate::config::ProxyConfig,
+    pub gc_epoch: Duration,
 
-    // Using a lock to remove any race conditions.
-    // Eg cleaning up connections while a new connection is returned
-    closed: RwLock<bool>,
+    pub pool_shards: usize,
+
+    pub idle_timeout: Duration,
+
+    pub opt_in: bool,
 }
 
+pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_http_pool_reclaimation_lag_seconds",
+        "Time it takes to reclaim unused connection pools",
+        // 1us -> 65ms
+        exponential_buckets(1e-6, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
+    register_int_counter_pair!(
+        "proxy_http_pool_endpoints_registered_total",
+        "Number of endpoints we have registered pools for",
+        "proxy_http_pool_endpoints_unregistered_total",
+        "Number of endpoints we have unregistered pools for",
+    )
+    .unwrap()
+});
+
 impl GlobalConnPool {
     pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
+        let shards = config.http_config.pool_options.pool_shards;
         Arc::new(Self {
-            global_pool: DashMap::new(),
+            global_pool: DashMap::with_shard_amount(shards),
             global_pool_size: AtomicUsize::new(0),
-            max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
             proxy_config: config,
-            closed: RwLock::new(false),
         })
     }
 
     pub fn shutdown(&self) {
-        *self.closed.write() = true;
+        // drops all strong references to endpoint-pools
+        self.global_pool.clear();
+    }
 
-        self.global_pool.retain(|_, endpoint_pool| {
-            let mut pool = endpoint_pool.write();
-            // by clearing this hashmap, we remove the slots that a connection can be returned to.
-            // when returning, it drops the connection if the slot doesn't exist
-            pool.pools.clear();
-            pool.total_conns = 0;
+    pub async fn gc_worker(&self, mut rng: impl Rng) {
+        let epoch = self.proxy_config.http_config.pool_options.gc_epoch;
+        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
+        loop {
+            interval.tick().await;
 
-            false
+            let shard = rng.gen_range(0..self.global_pool.shards().len());
+            self.gc(shard);
+        }
+    }
+
+    fn gc(&self, shard: usize) {
+        debug!(shard, "pool: performing epoch reclamation");
+
+        // acquire a random shard lock
+        let mut shard = self.global_pool.shards()[shard].write();
+
+        let timer = GC_LATENCY.start_timer();
+        let current_len = shard.len();
+        shard.retain(|endpoint, x| {
+            // if the current endpoint pool is unique (no other strong or weak references)
+            // then it is currently not in use by any connections.
+            if let Some(pool) = Arc::get_mut(x.get_mut()) {
+                let EndpointConnPool {
+                    pools, total_conns, ..
+                } = pool.get_mut();
+
+                // ensure that closed clients are removed
+                pools
+                    .iter_mut()
+                    .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns));
+
+                // we only remove this pool if it has no active connections
+                if *total_conns == 0 {
+                    info!("pool: discarding pool for endpoint {endpoint}");
+                    return false;
+                }
+            }
+
+            true
         });
+        let new_len = shard.len();
+        drop(shard);
+        timer.observe_duration();
+
+        let removed = current_len - new_len;
+
+        if removed > 0 {
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_sub(removed, atomic::Ordering::Relaxed)
+                - removed;
+            info!("pool: performed global pool gc. size now {global_pool_size}");
+        }
     }
 
     pub async fn get(
         self: &Arc<Self>,
-        conn_info: &ConnInfo,
+        conn_info: ConnInfo,
         force_new: bool,
         session_id: uuid::Uuid,
         peer_addr: IpAddr,
@@ -147,15 +317,11 @@ impl GlobalConnPool {
         let mut client: Option<ClientInner> = None;
         let mut latency_timer = LatencyTimer::new("http");
 
-        let pool = if force_new {
-            None
-        } else {
-            Some((conn_info.clone(), self.clone()))
-        };
-
         let mut hash_valid = false;
+        let mut endpoint_pool = Weak::new();
         if !force_new {
             let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+            endpoint_pool = Arc::downgrade(&pool);
             let mut hash = None;
 
             // find a pool entry by (dbname, username) if exists
@@ -180,12 +346,8 @@ impl GlobalConnPool {
                 // we will continue with the regular connection flow
                 if validate.is_ok() {
                     hash_valid = true;
-                    let mut pool = pool.write();
-                    if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
-                        if let Some(entry) = pool_entries.conns.pop() {
-                            client = Some(entry.conn);
-                            pool.total_conns -= 1;
-                        }
+                    if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) {
+                        client = Some(entry.conn)
                     }
                 }
             }
@@ -198,11 +360,12 @@ impl GlobalConnPool {
                 info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
                 connect_to_compute(
                     self.proxy_config,
-                    conn_info,
+                    &conn_info,
                     conn_id,
                     session_id,
                     latency_timer,
                     peer_addr,
+                    endpoint_pool.clone(),
                 )
                 .await
             } else {
@@ -214,18 +377,19 @@ impl GlobalConnPool {
                 );
                 latency_timer.pool_hit();
                 latency_timer.success();
-                return Ok(Client::new(client, pool).await);
+                return Ok(Client::new(client, conn_info, endpoint_pool).await);
             }
         } else {
             let conn_id = uuid::Uuid::new_v4();
             info!(%conn_id, "pool: opening a new connection '{conn_info}'");
             connect_to_compute(
                 self.proxy_config,
-                conn_info,
+                &conn_info,
                 conn_id,
                 session_id,
                 latency_timer,
                 peer_addr,
+                endpoint_pool.clone(),
             )
             .await
         };
@@ -269,59 +433,7 @@ impl GlobalConnPool {
             _ => {}
         }
         let new_client = new_client?;
-        Ok(Client::new(new_client, pool).await)
-    }
-
-    fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
-        let conn_id = client.conn_id;
-
-        // We want to hold this open while we return. This ensures that the pool can't close
-        // while we are in the middle of returning the connection.
-        let closed = self.closed.read();
-        if *closed {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
-            return Ok(());
-        }
-
-        if client.inner.is_closed() {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
-            return Ok(());
-        }
-
-        let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
-
-        // return connection to the pool
-        let mut returned = false;
-        let mut per_db_size = 0;
-        let total_conns = {
-            let mut pool = pool.write();
-
-            if pool.total_conns < self.max_conns_per_endpoint {
-                // we create this db-user entry in get, so it should not be None
-                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
-                    pool_entries.conns.push(ConnPoolEntry {
-                        conn: client,
-                        _last_access: std::time::Instant::now(),
-                    });
-
-                    returned = true;
-                    per_db_size = pool_entries.conns.len();
-
-                    pool.total_conns += 1;
-                }
-            }
-
-            pool.total_conns
-        };
-
-        // do logging outside of the mutex
-        if returned {
-            info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
-        } else {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
-        }
-
-        Ok(())
+        Ok(Client::new(new_client, conn_info, endpoint_pool).await)
     }
 
     fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
@@ -334,6 +446,12 @@ impl GlobalConnPool {
         let new_pool = Arc::new(RwLock::new(EndpointConnPool {
             pools: HashMap::new(),
             total_conns: 0,
+            max_conns: self
+                .proxy_config
+                .http_config
+                .pool_options
+                .max_conns_per_endpoint,
+            _guard: ENDPOINT_POOLS.guard(),
         }));
 
         // find or create a pool for this endpoint
@@ -363,9 +481,11 @@ impl GlobalConnPool {
 }
 
 struct TokioMechanism<'a> {
+    pool: Weak<RwLock<EndpointConnPool>>,
     conn_info: &'a ConnInfo,
     session_id: uuid::Uuid,
     conn_id: uuid::Uuid,
+    idle: Duration,
 }
 
 #[async_trait]
@@ -385,6 +505,8 @@ impl ConnectMechanism for TokioMechanism<'_> {
             timeout,
             self.conn_id,
             self.session_id,
+            self.pool.clone(),
+            self.idle,
         )
         .await
     }
@@ -403,6 +525,7 @@ async fn connect_to_compute(
     session_id: uuid::Uuid,
     latency_timer: LatencyTimer,
     peer_addr: IpAddr,
+    pool: Weak<RwLock<EndpointConnPool>>,
 ) -> anyhow::Result<ClientInner> {
     let tls = config.tls_config.as_ref();
     let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -447,6 +570,8 @@ async fn connect_to_compute(
             conn_id,
             conn_info,
             session_id,
+            pool,
+            idle: config.http_config.pool_options.idle_timeout,
         },
         node_info,
         &extra,
@@ -462,6 +587,8 @@ async fn connect_to_compute_once(
     timeout: time::Duration,
     conn_id: uuid::Uuid,
     mut session: uuid::Uuid,
+    pool: Weak<RwLock<EndpointConnPool>>,
+    idle: Duration,
 ) -> Result<ClientInner, tokio_postgres::Error> {
     let mut config = (*node_info.config).clone();
 
@@ -490,13 +617,29 @@ async fn connect_to_compute_once(
         branch_id: node_info.aux.branch_id.clone(),
     };
 
+    let db_user = conn_info.db_and_user();
     tokio::spawn(
         async move {
             let _conn_gauge = conn_gauge;
+            let mut idle_timeout = pin!(tokio::time::sleep(idle));
             poll_fn(move |cx| {
                 if matches!(rx.has_changed(), Ok(true)) {
                     session = *rx.borrow_and_update();
                     info!(%session, "changed session");
+                    idle_timeout.as_mut().reset(Instant::now() + idle);
+                }
+
+                // 5 minute idle connection timeout
+                if idle_timeout.as_mut().poll(cx).is_ready() {
+                    idle_timeout.as_mut().reset(Instant::now() + idle);
+                    info!("connection idle");
+                    if let Some(pool) = pool.clone().upgrade() {
+                        // remove client from pool - should close the connection if it's idle.
+                        // does nothing if the client is currently checked-out and in-use
+                        if pool.write().remove_client(db_user.clone(), conn_id) {
+                            info!("idle connection removed");
+                        }
+                    }
                 }
 
                 loop {
@@ -514,15 +657,25 @@ async fn connect_to_compute_once(
                         }
                         Some(Err(e)) => {
                             error!(%session, "connection error: {}", e);
-                            return Poll::Ready(())
+                            break
                         }
                         None => {
                             info!("connection closed");
-                            return Poll::Ready(())
+                            break
                         }
                     }
                 }
-            }).await
+
+                // remove from connection pool
+                if let Some(pool) = pool.clone().upgrade() {
+                    if pool.write().remove_client(db_user.clone(), conn_id) {
+                        info!("closed connection removed");
+                    }
+                }
+
+                Poll::Ready(())
+            }).await;
+
         }
         .instrument(span)
     );
@@ -552,23 +705,27 @@ pub struct Client {
     conn_id: uuid::Uuid,
     span: Span,
     inner: Option<ClientInner>,
-    pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
+    conn_info: ConnInfo,
+    pool: Weak<RwLock<EndpointConnPool>>,
 }
 
 pub struct Discard<'a> {
     conn_id: uuid::Uuid,
-    pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
+    conn_info: &'a ConnInfo,
+    pool: &'a mut Weak<RwLock<EndpointConnPool>>,
 }
 
 impl Client {
     pub(self) async fn new(
         inner: ClientInner,
-        pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
+        conn_info: ConnInfo,
+        pool: Weak<RwLock<EndpointConnPool>>,
     ) -> Self {
         Self {
             conn_id: inner.conn_id,
             inner: Some(inner),
             span: Span::current(),
+            conn_info,
             pool,
         }
     }
@@ -577,6 +734,7 @@ impl Client {
             inner,
             pool,
             conn_id,
+            conn_info,
             span: _,
         } = self;
         (
@@ -586,6 +744,7 @@ impl Client {
                 .inner,
             Discard {
                 pool,
+                conn_info,
                 conn_id: *conn_id,
             },
         )
@@ -601,14 +760,14 @@ impl Client {
 
 impl Discard<'_> {
     pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
-        if status != ReadyForQueryStatus::Idle {
-            if let Some((conn_info, _)) = self.pool.take() {
-                info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
-            }
+        let conn_info = &self.conn_info;
+        if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
+            info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
         }
     }
     pub fn discard(&mut self) {
-        if let Some((conn_info, _)) = self.pool.take() {
+        let conn_info = &self.conn_info;
+        if std::mem::take(self.pool).strong_count() > 0 {
             info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
         }
     }
@@ -628,16 +787,17 @@ impl Deref for Client {
 
 impl Drop for Client {
     fn drop(&mut self) {
+        let conn_info = self.conn_info.clone();
         let client = self
             .inner
             .take()
             .expect("client inner should not be removed");
-        if let Some((conn_info, conn_pool)) = self.pool.take() {
+        if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
             let current_span = self.span.clone();
             // return connection to the pool
             tokio::task::spawn_blocking(move || {
                 let _span = current_span.enter();
-                let _ = conn_pool.put(&conn_info, client);
+                let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
             });
         }
     }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 307b085ce0..2e9d8526d3 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -206,7 +206,7 @@ pub async fn handle(
     config: &'static HttpConfig,
 ) -> Result<Response<Body>, ApiError> {
     let result = tokio::time::timeout(
-        config.timeout,
+        config.request_timeout,
         handle_inner(
             config,
             request,
@@ -278,7 +278,7 @@ pub async fn handle(
         Err(_) => {
             let message = format!(
                 "HTTP-Connection timed out, execution time exeeded {} seconds",
-                config.timeout.as_secs()
+                config.request_timeout.as_secs()
             );
             error!(message);
             json_response(
@@ -320,7 +320,8 @@ async fn handle_inner(
 
     // Allow connection pooling only if explicitly requested
     // or if we have decided that http pool is no longer opt-in
-    let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
+    let allow_pool =
+        !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
 
     // isolation level, read only and deferrable
 
@@ -359,7 +360,7 @@ async fn handle_inner(
     let payload: Payload = serde_json::from_slice(&body)?;
 
     let mut client = conn_pool
-        .get(&conn_info, !allow_pool, session_id, peer_addr)
+        .get(conn_info, !allow_pool, session_id, peer_addr)
         .await?;
 
     let mut response = Response::builder()

From 5385791ca6e75167b1f8789d0d995332a4c9f512 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 21 Dec 2023 13:07:23 +0100
Subject: [PATCH 151/202] add pageserver component-level benchmark
 (`pagebench`) (#6174)

This PR adds a component-level benchmarking utility for pageserver.
Its name is `pagebench`.

The problem solved by `pagebench` is that we want to put Pageserver
under high load.

This isn't easily achieved with `pgbench` because it needs to go through
a compute, which has significant performance overhead compared to
accessing Pageserver directly.

Further, compute has its own performance optimizations (most
importantly: caches). Instead of designing a compute-facing workload
that defeats those internal optimizations, `pagebench` simply bypasses
them by accessing pageserver directly.

Supported benchmarks:

* getpage@latest_lsn
* basebackup
* triggering logical size calculation

This code has no automated users yet.
A performance regression test for getpage@latest_lsn will be added in a
later PR.

part of https://github.com/neondatabase/neon/issues/5771
---
 Cargo.lock                                    |  36 ++
 Cargo.toml                                    |   2 +
 libs/pageserver_api/src/shard.rs              |   4 +
 libs/utils/src/lsn.rs                         |  43 +++
 pageserver/client/src/mgmt_api.rs             |   2 +
 pageserver/client/src/mgmt_api/util.rs        |  49 +++
 pageserver/pagebench/Cargo.toml               |  26 ++
 pageserver/pagebench/src/cmd/basebackup.rs    | 272 ++++++++++++++
 .../pagebench/src/cmd/getpage_latest_lsn.rs   | 335 ++++++++++++++++++
 .../cmd/trigger_initial_size_calculation.rs   |  85 +++++
 pageserver/pagebench/src/main.rs              |  48 +++
 pageserver/pagebench/src/util/cli/targets.rs  |  34 ++
 pageserver/pagebench/src/util/connstring.rs   |   8 +
 .../pagebench/src/util/request_stats.rs       |  88 +++++
 .../src/util/tokio_thread_local_stats.rs      |  45 +++
 pageserver/src/pgdatadir_mapping.rs           |   2 +-
 16 files changed, 1078 insertions(+), 1 deletion(-)
 create mode 100644 pageserver/client/src/mgmt_api/util.rs
 create mode 100644 pageserver/pagebench/Cargo.toml
 create mode 100644 pageserver/pagebench/src/cmd/basebackup.rs
 create mode 100644 pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
 create mode 100644 pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
 create mode 100644 pageserver/pagebench/src/main.rs
 create mode 100644 pageserver/pagebench/src/util/cli/targets.rs
 create mode 100644 pageserver/pagebench/src/util/connstring.rs
 create mode 100644 pageserver/pagebench/src/util/request_stats.rs
 create mode 100644 pageserver/pagebench/src/util/tokio_thread_local_stats.rs

diff --git a/Cargo.lock b/Cargo.lock
index 0e51e88e3b..0be6d5d183 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2106,6 +2106,20 @@ dependencies = [
  "hashbrown 0.13.2",
 ]
 
+[[package]]
+name = "hdrhistogram"
+version = "7.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
+dependencies = [
+ "base64 0.21.1",
+ "byteorder",
+ "crossbeam-channel",
+ "flate2",
+ "nom",
+ "num-traits",
+]
+
 [[package]]
 name = "heapless"
 version = "0.8.0"
@@ -3057,6 +3071,28 @@ dependencies = [
  "sha2",
 ]
 
+[[package]]
+name = "pagebench"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "futures",
+ "hdrhistogram",
+ "humantime",
+ "humantime-serde",
+ "pageserver",
+ "pageserver_api",
+ "pageserver_client",
+ "rand 0.8.5",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "pagectl"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 6884de7bf5..5de636778a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ members = [
     "pageserver",
     "pageserver/ctl",
     "pageserver/client",
+    "pageserver/pagebench",
     "proxy",
     "safekeeper",
     "storage_broker",
@@ -79,6 +80,7 @@ futures-util = "0.3"
 git-version = "0.3"
 hashbrown = "0.13"
 hashlink = "0.8.1"
+hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 3668f7939d..3e4936eec4 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -81,6 +81,10 @@ impl TenantShardId {
     pub fn is_zero(&self) -> bool {
         self.shard_number == ShardNumber(0)
     }
+
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+    }
 }
 
 /// Formatting helper
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index 262dcb8a8a..b3269ae049 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -366,6 +366,49 @@ impl MonotonicCounter<Lsn> for RecordLsn {
     }
 }
 
+/// Implements  [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
+///
+/// This is used by the `pagebench` pageserver benchmarking tool.
+pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
+
+impl rand::distributions::uniform::SampleUniform for Lsn {
+    type Sampler = LsnSampler;
+}
+
+impl rand::distributions::uniform::UniformSampler for LsnSampler {
+    type X = Lsn;
+
+    fn new<B1, B2>(low: B1, high: B2) -> Self
+    where
+        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
+        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
+    {
+        Self(
+            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
+                low.borrow().0,
+                high.borrow().0,
+            ),
+        )
+    }
+
+    fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
+    where
+        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
+        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
+    {
+        Self(
+            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
+                low.borrow().0,
+                high.borrow().0,
+            ),
+        )
+    }
+
+    fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
+        Lsn(self.0.sample(rng))
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::bin_ser::BeSer;
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 0ad4e1551e..87e4ed8efd 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -5,6 +5,8 @@ use utils::{
     id::{TenantId, TimelineId},
 };
 
+pub mod util;
+
 #[derive(Debug)]
 pub struct Client {
     mgmt_api_endpoint: String,
diff --git a/pageserver/client/src/mgmt_api/util.rs b/pageserver/client/src/mgmt_api/util.rs
new file mode 100644
index 0000000000..048a3bb7cd
--- /dev/null
+++ b/pageserver/client/src/mgmt_api/util.rs
@@ -0,0 +1,49 @@
+//! Helpers to do common higher-level tasks with the [`Client`].
+
+use std::sync::Arc;
+
+use tokio::task::JoinSet;
+use utils::id::{TenantId, TenantTimelineId};
+
+use super::Client;
+
+/// Retrieve a list of all of the pageserver's timelines.
+///
+/// Fails if there are sharded tenants present on the pageserver.
+pub async fn get_pageserver_tenant_timelines_unsharded(
+    api_client: &Arc<Client>,
+) -> anyhow::Result<Vec<TenantTimelineId>> {
+    let mut timelines: Vec<TenantTimelineId> = Vec::new();
+    let mut tenants: Vec<TenantId> = Vec::new();
+    for ti in api_client.list_tenants().await? {
+        if !ti.id.is_unsharded() {
+            anyhow::bail!(
+                "only unsharded tenants are supported at this time: {}",
+                ti.id
+            );
+        }
+        tenants.push(ti.id.tenant_id)
+    }
+    let mut js = JoinSet::new();
+    for tenant_id in tenants {
+        js.spawn({
+            let mgmt_api_client = Arc::clone(api_client);
+            async move {
+                (
+                    tenant_id,
+                    mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
+                )
+            }
+        });
+    }
+    while let Some(res) = js.join_next().await {
+        let (tenant_id, details) = res.unwrap();
+        for timeline_id in details.timelines {
+            timelines.push(TenantTimelineId {
+                tenant_id,
+                timeline_id,
+            });
+        }
+    }
+    Ok(timelines)
+}
diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml
new file mode 100644
index 0000000000..169d9b7f8e
--- /dev/null
+++ b/pageserver/pagebench/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "pagebench"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+clap.workspace = true
+futures.workspace = true
+hdrhistogram.workspace = true
+humantime.workspace = true
+humantime-serde.workspace = true
+rand.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+tracing.workspace = true
+tokio.workspace = true
+
+pageserver = { path = ".." }
+pageserver_client.workspace = true
+pageserver_api.workspace = true
+utils = { path = "../../libs/utils/" }
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs
new file mode 100644
index 0000000000..85a3e695de
--- /dev/null
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -0,0 +1,272 @@
+use anyhow::Context;
+use pageserver_client::page_service::BasebackupRequest;
+
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+
+use rand::prelude::*;
+use tokio::sync::Barrier;
+use tokio::task::JoinSet;
+use tracing::{debug, info, instrument};
+
+use std::collections::HashMap;
+use std::num::NonZeroUsize;
+use std::ops::Range;
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex};
+use std::time::Instant;
+
+use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
+use crate::util::{request_stats, tokio_thread_local_stats};
+
+/// basebackup@LatestLSN
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    mgmt_api_endpoint: String,
+    #[clap(long, default_value = "localhost:64000")]
+    page_service_host_port: String,
+    #[clap(long)]
+    pageserver_jwt: Option<String>,
+    #[clap(long, default_value = "1")]
+    num_clients: NonZeroUsize,
+    #[clap(long, default_value = "1.0")]
+    gzip_probability: f64,
+    #[clap(long)]
+    runtime: Option<humantime::Duration>,
+    #[clap(long)]
+    limit_to_first_n_targets: Option<usize>,
+    targets: Option<Vec<TenantTimelineId>>,
+}
+
+#[derive(Debug, Default)]
+struct LiveStats {
+    completed_requests: AtomicU64,
+}
+
+impl LiveStats {
+    fn inc(&self) {
+        self.completed_requests.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+struct Target {
+    timeline: TenantTimelineId,
+    lsn_range: Option<Range<Lsn>>,
+}
+
+#[derive(serde::Serialize)]
+struct Output {
+    total: request_stats::Output,
+}
+
+tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
+        main_impl(args, thread_local_stats)
+    })
+}
+
+async fn main_impl(
+    args: Args,
+    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
+) -> anyhow::Result<()> {
+    let args: &'static Args = Box::leak(Box::new(args));
+
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        args.mgmt_api_endpoint.clone(),
+        args.pageserver_jwt.as_deref(),
+    ));
+
+    // discover targets
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: args.limit_to_first_n_targets,
+            targets: args.targets.clone(),
+        },
+    )
+    .await?;
+    let mut js = JoinSet::new();
+    for timeline in &timelines {
+        js.spawn({
+            let timeline = *timeline;
+            // FIXME: this triggers initial logical size calculation
+            // https://github.com/neondatabase/neon/issues/6168
+            let info = mgmt_api_client
+                .timeline_info(timeline.tenant_id, timeline.timeline_id)
+                .await
+                .unwrap();
+            async move {
+                anyhow::Ok(Target {
+                    timeline,
+                    // TODO: support lsn_range != latest LSN
+                    lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
+                })
+            }
+        });
+    }
+    let mut all_targets: Vec<Target> = Vec::new();
+    while let Some(res) = js.join_next().await {
+        all_targets.push(res.unwrap().unwrap());
+    }
+
+    let live_stats = Arc::new(LiveStats::default());
+
+    let num_client_tasks = timelines.len();
+    let num_live_stats_dump = 1;
+    let num_work_sender_tasks = 1;
+
+    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
+        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
+    ));
+    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
+
+    tokio::spawn({
+        let stats = Arc::clone(&live_stats);
+        let start_work_barrier = Arc::clone(&start_work_barrier);
+        async move {
+            start_work_barrier.wait().await;
+            loop {
+                let start = std::time::Instant::now();
+                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
+                let elapsed = start.elapsed();
+                info!(
+                    "RPS: {:.0}",
+                    completed_requests as f64 / elapsed.as_secs_f64()
+                );
+            }
+        }
+    });
+
+    let mut work_senders = HashMap::new();
+    let mut tasks = Vec::new();
+    for tl in &timelines {
+        let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
+        work_senders.insert(tl, sender);
+        tasks.push(tokio::spawn(client(
+            args,
+            *tl,
+            Arc::clone(&start_work_barrier),
+            receiver,
+            Arc::clone(&all_work_done_barrier),
+            Arc::clone(&live_stats),
+        )));
+    }
+
+    let work_sender = async move {
+        start_work_barrier.wait().await;
+        loop {
+            let (timeline, work) = {
+                let mut rng = rand::thread_rng();
+                let target = all_targets.choose(&mut rng).unwrap();
+                let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
+                (
+                    target.timeline,
+                    Work {
+                        lsn,
+                        gzip: rng.gen_bool(args.gzip_probability),
+                    },
+                )
+            };
+            let sender = work_senders.get(&timeline).unwrap();
+            // TODO: what if this blocks?
+            sender.send(work).await.ok().unwrap();
+        }
+    };
+
+    if let Some(runtime) = args.runtime {
+        match tokio::time::timeout(runtime.into(), work_sender).await {
+            Ok(()) => unreachable!("work sender never terminates"),
+            Err(_timeout) => {
+                // this implicitly drops the work_senders, making all the clients exit
+            }
+        }
+    } else {
+        work_sender.await;
+        unreachable!("work sender never terminates");
+    }
+
+    for t in tasks {
+        t.await.unwrap();
+    }
+
+    let output = Output {
+        total: {
+            let mut agg_stats = request_stats::Stats::new();
+            for stats in all_thread_local_stats.lock().unwrap().iter() {
+                let stats = stats.lock().unwrap();
+                agg_stats.add(&stats);
+            }
+            agg_stats.output()
+        },
+    };
+
+    let output = serde_json::to_string_pretty(&output).unwrap();
+    println!("{output}");
+
+    anyhow::Ok(())
+}
+
+#[derive(Copy, Clone)]
+struct Work {
+    lsn: Option<Lsn>,
+    gzip: bool,
+}
+
+#[instrument(skip_all)]
+async fn client(
+    args: &'static Args,
+    timeline: TenantTimelineId,
+    start_work_barrier: Arc<Barrier>,
+    mut work: tokio::sync::mpsc::Receiver<Work>,
+    all_work_done_barrier: Arc<Barrier>,
+    live_stats: Arc<LiveStats>,
+) {
+    start_work_barrier.wait().await;
+
+    let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
+        &args.page_service_host_port,
+        args.pageserver_jwt.as_deref(),
+    ))
+    .await
+    .unwrap();
+
+    while let Some(Work { lsn, gzip }) = work.recv().await {
+        let start = Instant::now();
+        let copy_out_stream = client
+            .basebackup(&BasebackupRequest {
+                tenant_id: timeline.tenant_id,
+                timeline_id: timeline.timeline_id,
+                lsn,
+                gzip,
+            })
+            .await
+            .with_context(|| format!("start basebackup for {timeline}"))
+            .unwrap();
+
+        use futures::StreamExt;
+        let size = Arc::new(AtomicUsize::new(0));
+        copy_out_stream
+            .for_each({
+                |r| {
+                    let size = Arc::clone(&size);
+                    async move {
+                        let size = Arc::clone(&size);
+                        size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
+                    }
+                }
+            })
+            .await;
+        debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
+        let elapsed = start.elapsed();
+        live_stats.inc();
+        STATS.with(|stats| {
+            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
+        });
+    }
+
+    all_work_done_barrier.wait().await;
+}
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
new file mode 100644
index 0000000000..16d198ab0e
--- /dev/null
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -0,0 +1,335 @@
+use anyhow::Context;
+use futures::future::join_all;
+use pageserver::pgdatadir_mapping::key_to_rel_block;
+use pageserver::repository;
+use pageserver_api::key::is_rel_block_key;
+use pageserver_client::page_service::RelTagBlockNo;
+
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+
+use rand::prelude::*;
+use tokio::sync::Barrier;
+use tokio::task::JoinSet;
+use tracing::{info, instrument};
+
+use std::collections::HashMap;
+use std::future::Future;
+use std::num::NonZeroUsize;
+use std::pin::Pin;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
+use crate::util::{request_stats, tokio_thread_local_stats};
+
+/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    mgmt_api_endpoint: String,
+    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
+    page_service_connstring: String,
+    #[clap(long)]
+    pageserver_jwt: Option<String>,
+    #[clap(long, default_value = "1")]
+    num_clients: NonZeroUsize,
+    #[clap(long)]
+    runtime: Option<humantime::Duration>,
+    #[clap(long)]
+    per_target_rate_limit: Option<usize>,
+    #[clap(long)]
+    limit_to_first_n_targets: Option<usize>,
+    targets: Option<Vec<TenantTimelineId>>,
+}
+
+#[derive(Debug, Default)]
+struct LiveStats {
+    completed_requests: AtomicU64,
+}
+
+impl LiveStats {
+    fn inc(&self) {
+        self.completed_requests.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+#[derive(Clone)]
+struct KeyRange {
+    timeline: TenantTimelineId,
+    timeline_lsn: Lsn,
+    start: i128,
+    end: i128,
+}
+
+impl KeyRange {
+    fn len(&self) -> i128 {
+        self.end - self.start
+    }
+}
+
+#[derive(serde::Serialize)]
+struct Output {
+    total: request_stats::Output,
+}
+
+tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
+        main_impl(args, thread_local_stats)
+    })
+}
+
+async fn main_impl(
+    args: Args,
+    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
+) -> anyhow::Result<()> {
+    let args: &'static Args = Box::leak(Box::new(args));
+
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        args.mgmt_api_endpoint.clone(),
+        args.pageserver_jwt.as_deref(),
+    ));
+
+    // discover targets
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: args.limit_to_first_n_targets,
+            targets: args.targets.clone(),
+        },
+    )
+    .await?;
+
+    let mut js = JoinSet::new();
+    for timeline in &timelines {
+        js.spawn({
+            let mgmt_api_client = Arc::clone(&mgmt_api_client);
+            let timeline = *timeline;
+            async move {
+                let partitioning = mgmt_api_client
+                    .keyspace(timeline.tenant_id, timeline.timeline_id)
+                    .await?;
+                let lsn = partitioning.at_lsn;
+
+                let ranges = partitioning
+                    .keys
+                    .ranges
+                    .iter()
+                    .filter_map(|r| {
+                        let start = r.start;
+                        let end = r.end;
+                        // filter out non-relblock keys
+                        match (is_rel_block_key(&start), is_rel_block_key(&end)) {
+                            (true, true) => Some(KeyRange {
+                                timeline,
+                                timeline_lsn: lsn,
+                                start: start.to_i128(),
+                                end: end.to_i128(),
+                            }),
+                            (true, false) | (false, true) => {
+                                unimplemented!("split up range")
+                            }
+                            (false, false) => None,
+                        }
+                    })
+                    .collect::<Vec<_>>();
+
+                anyhow::Ok(ranges)
+            }
+        });
+    }
+    let mut all_ranges: Vec<KeyRange> = Vec::new();
+    while let Some(res) = js.join_next().await {
+        all_ranges.extend(res.unwrap().unwrap());
+    }
+
+    let live_stats = Arc::new(LiveStats::default());
+
+    let num_client_tasks = timelines.len();
+    let num_live_stats_dump = 1;
+    let num_work_sender_tasks = 1;
+
+    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
+        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
+    ));
+    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
+
+    tokio::spawn({
+        let stats = Arc::clone(&live_stats);
+        let start_work_barrier = Arc::clone(&start_work_barrier);
+        async move {
+            start_work_barrier.wait().await;
+            loop {
+                let start = std::time::Instant::now();
+                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
+                let elapsed = start.elapsed();
+                info!(
+                    "RPS: {:.0}",
+                    completed_requests as f64 / elapsed.as_secs_f64()
+                );
+            }
+        }
+    });
+
+    let mut work_senders = HashMap::new();
+    let mut tasks = Vec::new();
+    for tl in &timelines {
+        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
+        work_senders.insert(tl, sender);
+        tasks.push(tokio::spawn(client(
+            args,
+            *tl,
+            Arc::clone(&start_work_barrier),
+            receiver,
+            Arc::clone(&all_work_done_barrier),
+            Arc::clone(&live_stats),
+        )));
+    }
+
+    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
+        None => Box::pin(async move {
+            let weights = rand::distributions::weighted::WeightedIndex::new(
+                all_ranges.iter().map(|v| v.len()),
+            )
+            .unwrap();
+
+            start_work_barrier.wait().await;
+
+            loop {
+                let (range, key) = {
+                    let mut rng = rand::thread_rng();
+                    let r = &all_ranges[weights.sample(&mut rng)];
+                    let key: i128 = rng.gen_range(r.start..r.end);
+                    let key = repository::Key::from_i128(key);
+                    let (rel_tag, block_no) =
+                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
+                    (r, RelTagBlockNo { rel_tag, block_no })
+                };
+                let sender = work_senders.get(&range.timeline).unwrap();
+                // TODO: what if this blocks?
+                sender.send((key, range.timeline_lsn)).await.ok().unwrap();
+            }
+        }),
+        Some(rps_limit) => Box::pin(async move {
+            let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
+
+            let make_timeline_task: &dyn Fn(
+                TenantTimelineId,
+            )
+                -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
+                let sender = work_senders.get(&timeline).unwrap();
+                let ranges: Vec<KeyRange> = all_ranges
+                    .iter()
+                    .filter(|r| r.timeline == timeline)
+                    .cloned()
+                    .collect();
+                let weights = rand::distributions::weighted::WeightedIndex::new(
+                    ranges.iter().map(|v| v.len()),
+                )
+                .unwrap();
+
+                Box::pin(async move {
+                    let mut ticker = tokio::time::interval(period);
+                    ticker.set_missed_tick_behavior(
+                        /* TODO review this choice */
+                        tokio::time::MissedTickBehavior::Burst,
+                    );
+                    loop {
+                        ticker.tick().await;
+                        let (range, key) = {
+                            let mut rng = rand::thread_rng();
+                            let r = &ranges[weights.sample(&mut rng)];
+                            let key: i128 = rng.gen_range(r.start..r.end);
+                            let key = repository::Key::from_i128(key);
+                            let (rel_tag, block_no) = key_to_rel_block(key)
+                                .expect("we filter non-rel-block keys out above");
+                            (r, RelTagBlockNo { rel_tag, block_no })
+                        };
+                        sender.send((key, range.timeline_lsn)).await.ok().unwrap();
+                    }
+                })
+            };
+
+            let tasks: Vec<_> = work_senders
+                .keys()
+                .map(|tl| make_timeline_task(**tl))
+                .collect();
+
+            start_work_barrier.wait().await;
+
+            join_all(tasks).await;
+        }),
+    };
+
+    if let Some(runtime) = args.runtime {
+        match tokio::time::timeout(runtime.into(), work_sender).await {
+            Ok(()) => unreachable!("work sender never terminates"),
+            Err(_timeout) => {
+                // this implicitly drops the work_senders, making all the clients exit
+            }
+        }
+    } else {
+        work_sender.await;
+        unreachable!("work sender never terminates");
+    }
+
+    for t in tasks {
+        t.await.unwrap();
+    }
+
+    let output = Output {
+        total: {
+            let mut agg_stats = request_stats::Stats::new();
+            for stats in all_thread_local_stats.lock().unwrap().iter() {
+                let stats = stats.lock().unwrap();
+                agg_stats.add(&stats);
+            }
+            agg_stats.output()
+        },
+    };
+
+    let output = serde_json::to_string_pretty(&output).unwrap();
+    println!("{output}");
+
+    anyhow::Ok(())
+}
+
+#[instrument(skip_all)]
+async fn client(
+    args: &'static Args,
+    timeline: TenantTimelineId,
+    start_work_barrier: Arc<Barrier>,
+    mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
+    all_work_done_barrier: Arc<Barrier>,
+    live_stats: Arc<LiveStats>,
+) {
+    start_work_barrier.wait().await;
+
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();
+    let mut client = client
+        .pagestream(timeline.tenant_id, timeline.timeline_id)
+        .await
+        .unwrap();
+
+    while let Some((key, lsn)) = work.recv().await {
+        let start = Instant::now();
+        client
+            .getpage(key, lsn)
+            .await
+            .with_context(|| format!("getpage for {timeline}"))
+            .unwrap();
+        let elapsed = start.elapsed();
+        live_stats.inc();
+        STATS.with(|stats| {
+            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
+        });
+    }
+
+    all_work_done_barrier.wait().await;
+}
diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
new file mode 100644
index 0000000000..d46ae94e8a
--- /dev/null
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -0,0 +1,85 @@
+use std::sync::Arc;
+
+use humantime::Duration;
+use tokio::task::JoinSet;
+use utils::id::TenantTimelineId;
+
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    mgmt_api_endpoint: String,
+    #[clap(long, default_value = "localhost:64000")]
+    page_service_host_port: String,
+    #[clap(long)]
+    pageserver_jwt: Option<String>,
+    #[clap(
+        long,
+        help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
+    )]
+    poll_for_completion: Option<Duration>,
+    #[clap(long)]
+    limit_to_first_n_targets: Option<usize>,
+    targets: Option<Vec<TenantTimelineId>>,
+}
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let main_task = rt.spawn(main_impl(args));
+    rt.block_on(main_task).unwrap()
+}
+
+async fn main_impl(args: Args) -> anyhow::Result<()> {
+    let args: &'static Args = Box::leak(Box::new(args));
+
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        args.mgmt_api_endpoint.clone(),
+        args.pageserver_jwt.as_deref(),
+    ));
+
+    // discover targets
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: args.limit_to_first_n_targets,
+            targets: args.targets.clone(),
+        },
+    )
+    .await?;
+
+    // kick it off
+
+    let mut js = JoinSet::new();
+    for tl in timelines {
+        let mgmt_api_client = Arc::clone(&mgmt_api_client);
+        js.spawn(async move {
+            // TODO: API to explicitly trigger initial logical size computation.
+            // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
+            // => https://github.com/neondatabase/neon/issues/6168
+            let info = mgmt_api_client
+                .timeline_info(tl.tenant_id, tl.timeline_id)
+                .await
+                .unwrap();
+
+            if let Some(period) = args.poll_for_completion {
+                let mut ticker = tokio::time::interval(period.into());
+                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
+                let mut info = info;
+                while !info.current_logical_size_is_accurate {
+                    ticker.tick().await;
+                    info = mgmt_api_client
+                        .timeline_info(tl.tenant_id, tl.timeline_id)
+                        .await
+                        .unwrap();
+                }
+            }
+        });
+    }
+    while let Some(res) = js.join_next().await {
+        let _: () = res.unwrap();
+    }
+    Ok(())
+}
diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs
new file mode 100644
index 0000000000..e0120c9212
--- /dev/null
+++ b/pageserver/pagebench/src/main.rs
@@ -0,0 +1,48 @@
+use clap::Parser;
+use utils::logging;
+
+/// Re-usable pieces of code that aren't CLI-specific.
+mod util {
+    pub(crate) mod connstring;
+    pub(crate) mod request_stats;
+    #[macro_use]
+    pub(crate) mod tokio_thread_local_stats;
+    /// Re-usable pieces of CLI-specific code.
+    pub(crate) mod cli {
+        pub(crate) mod targets;
+    }
+}
+
+/// The pagebench CLI sub-commands, dispatched in [`main`] below.
+mod cmd {
+    pub(super) mod basebackup;
+    pub(super) mod getpage_latest_lsn;
+    pub(super) mod trigger_initial_size_calculation;
+}
+
+/// Component-level performance test for pageserver.
+#[derive(clap::Parser)]
+enum Args {
+    Basebackup(cmd::basebackup::Args),
+    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
+    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
+}
+
+fn main() {
+    logging::init(
+        logging::LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::Disabled,
+        logging::Output::Stderr,
+    )
+    .unwrap();
+
+    let args = Args::parse();
+    match args {
+        Args::Basebackup(args) => cmd::basebackup::main(args),
+        Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
+        Args::TriggerInitialSizeCalculation(args) => {
+            cmd::trigger_initial_size_calculation::main(args)
+        }
+    }
+    .unwrap()
+}
diff --git a/pageserver/pagebench/src/util/cli/targets.rs b/pageserver/pagebench/src/util/cli/targets.rs
new file mode 100644
index 0000000000..848eae27cf
--- /dev/null
+++ b/pageserver/pagebench/src/util/cli/targets.rs
@@ -0,0 +1,34 @@
+use std::sync::Arc;
+
+use pageserver_client::mgmt_api;
+use tracing::info;
+use utils::id::TenantTimelineId;
+
+pub(crate) struct Spec {
+    pub(crate) limit_to_first_n_targets: Option<usize>,
+    pub(crate) targets: Option<Vec<TenantTimelineId>>,
+}
+
+pub(crate) async fn discover(
+    api_client: &Arc<mgmt_api::Client>,
+    spec: Spec,
+) -> anyhow::Result<Vec<TenantTimelineId>> {
+    let mut timelines = if let Some(targets) = spec.targets {
+        targets
+    } else {
+        mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
+    };
+
+    if let Some(limit) = spec.limit_to_first_n_targets {
+        timelines.sort(); // for determinism
+        timelines.truncate(limit);
+        if timelines.len() < limit {
+            anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants");
+        }
+    }
+
+    info!("timelines:\n{:?}", timelines);
+    info!("number of timelines:\n{:?}", timelines.len());
+
+    Ok(timelines)
+}
diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs
new file mode 100644
index 0000000000..07a0ff042d
--- /dev/null
+++ b/pageserver/pagebench/src/util/connstring.rs
@@ -0,0 +1,8 @@
+pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
+    let colon_and_jwt = if let Some(jwt) = jwt {
+        format!(":{jwt}") // TODO: urlescape
+    } else {
+        String::new()
+    };
+    format!("postgres://postgres{colon_and_jwt}@{host_port}")
+}
diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs
new file mode 100644
index 0000000000..5ecf1cbf24
--- /dev/null
+++ b/pageserver/pagebench/src/util/request_stats.rs
@@ -0,0 +1,88 @@
+use std::time::Duration;
+
+use anyhow::Context;
+
+pub(crate) struct Stats {
+    latency_histo: hdrhistogram::Histogram<u64>,
+}
+
+impl Stats {
+    pub(crate) fn new() -> Self {
+        Self {
+            // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
+            // which would skew the benchmark results.
+            latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
+        }
+    }
+    pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
+        let micros: u64 = latency
+            .as_micros()
+            .try_into()
+            .context("latency greater than u64")?;
+        self.latency_histo
+            .record(micros)
+            .context("add to histogram")?;
+        Ok(())
+    }
+    pub(crate) fn output(&self) -> Output {
+        let latency_percentiles = std::array::from_fn(|idx| {
+            let micros = self
+                .latency_histo
+                .value_at_percentile(LATENCY_PERCENTILES[idx]);
+            Duration::from_micros(micros)
+        });
+        Output {
+            request_count: self.latency_histo.len(),
+            latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
+            latency_percentiles: LatencyPercentiles {
+                latency_percentiles,
+            },
+        }
+    }
+    pub(crate) fn add(&mut self, other: &Self) {
+        let Self {
+            ref mut latency_histo,
+        } = self;
+        latency_histo.add(&other.latency_histo).unwrap();
+    }
+}
+
+impl Default for Stats {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
+
+struct LatencyPercentiles {
+    latency_percentiles: [Duration; 4],
+}
+
+/// Serializes as a map from percentile label (e.g. `p99`) to the
+/// human-readable latency measured at that percentile.
+impl serde::Serialize for LatencyPercentiles {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeMap;
+        let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
+        // Zip labels with values so each percentile reports its own latency (not always p95).
+        for (p, latency) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
+            ser.serialize_entry(
+                &format!("p{p}"),
+                &format!("{}", humantime::format_duration(*latency)),
+            )?;
+        }
+        ser.end()
+    }
+}
+
+#[derive(serde::Serialize)]
+pub(crate) struct Output {
+    request_count: u64,
+    #[serde(with = "humantime_serde")]
+    latency_mean: Duration,
+    latency_percentiles: LatencyPercentiles,
+}
diff --git a/pageserver/pagebench/src/util/tokio_thread_local_stats.rs b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs
new file mode 100644
index 0000000000..82526213b6
--- /dev/null
+++ b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs
@@ -0,0 +1,45 @@
+pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
+pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;
+
+macro_rules! declare {
+    ($THREAD_LOCAL_NAME:ident: $T:ty) => {
+        thread_local! {
+            pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
+                std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
+            );
+        }
+    };
+}
+
+use std::sync::{Arc, Mutex};
+
+pub(crate) use declare;
+
+macro_rules! main {
+    ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
+        let main_impl = $main_impl;
+        let all = Arc::new(Mutex::new(Vec::new()));
+
+        let rt = tokio::runtime::Builder::new_multi_thread()
+            .on_thread_start({
+                let all = Arc::clone(&all);
+                move || {
+                    // pre-initialize the thread local stats by accessing them
+                    // (some stats like request_stats::Stats are quite costly to initialize,
+                    //  we don't want to pay that cost during the measurement period)
+                    $THREAD_LOCAL_NAME.with(|stats| {
+                        let stats: Arc<_> = Arc::clone(&*stats.borrow());
+                        all.lock().unwrap().push(stats);
+                    });
+                }
+            })
+            .enable_all()
+            .build()
+            .unwrap();
+
+        let main_task = rt.spawn(main_impl(all));
+        rt.block_on(main_task).unwrap()
+    }};
+}
+
+pub(crate) use main;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index b81037ae47..e9884a15f5 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1776,6 +1776,7 @@ pub fn is_inherited_key(key: Key) -> bool {
     key != AUX_FILES_KEY
 }
 
+/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
@@ -1790,7 +1791,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
         _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
     })
 }
-
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
     key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }

From f93d15f78124b25e70fb2f61a837c878965a66b6 Mon Sep 17 00:00:00 2001
From: Bodobolero <peterbendel@neon.tech>
Date: Thu, 21 Dec 2023 13:34:31 +0100
Subject: [PATCH 152/202] add comment to run vacuum for clickbench (#6212)

## Problem

This is a comment only change.
To ensure that our benchmarking results are fair we need to have correct
stats in catalog. Otherwise optimizer chooses seq scan instead of index
only scan for some queries. Added comment to run vacuum after data prep.
---
 test_runner/performance/test_perf_olap.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py
index 1de7e95bbe..1e6e9a0174 100644
--- a/test_runner/performance/test_perf_olap.py
+++ b/test_runner/performance/test_perf_olap.py
@@ -151,7 +151,9 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale:
     An OLAP-style ClickHouse benchmark
 
     Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql
-    The DB prepared manually in advance
+    The DB prepared manually in advance.
+    Important: after initial data load, run `VACUUM (DISABLE_PAGE_SKIPPING, FREEZE, ANALYZE) hits;`
+    to ensure that Postgres optimizer chooses the same plans as RDS and Aurora.
     """
     explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true"
 

From 61b6c4cf3037c17700d12303724fc25de5bbc24c Mon Sep 17 00:00:00 2001
From: Abhijeet Patil <abhi.gets.mail@gmail.com>
Date: Thu, 21 Dec 2023 12:46:51 +0000
Subject: [PATCH 153/202] Build dockerfile from neon repo (#6195)

## Fixing GitHub workflow issue related to build and push images

## Summary of changes
Follow-up to PR#608 (move the Dockerfile from the build repo to neon) to
solve some issues

The build started failing because it missed a validation in logic that
determines changes in the docker file
Also, all the dependent jobs were skipped because of the build and push
of the image job.
To address the above issue following changes were made

- we are adding validation to generate image tag even if it's a merge to
repo.
- All the dependent jobs won't skip even if the build and push image job
is skipped.
- We have moved the logic to generate a tag in the sub-workflow. As the
tag name was necessary to be passed to the sub-workflow it made sense to
abstract that away where it was needed and then store it as an output
variable so that downward dependent jobs could access the value.
- This made the dependency logic easy and we don't need complex
expressions to check the condition on which it will run
- An earlier PR was closed that tried solving a similar problem that has
some feedback and context before creating this PR
https://github.com/neondatabase/neon/pull/6175

## Checklist before requesting a review

- [x] Move the tag generation logic from the main workflow to the
sub-workflow of build and push the image
- [x] Add a condition to generate an image tag for a non-PR-related run
- [x] Remove complex expressions from the jobs' `if` conditions

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Abhijeet Patil <abhijeet@neon.tech>
---
 .../workflows/build_and_push_docker_image.yml | 102 +++++++++++
 .github/workflows/build_and_test.yml          |  47 +++--
 .../workflows/update_build_tools_image.yml    | 130 ++++++++++++++
 .gitignore                                    |   1 +
 CONTRIBUTING.md                               |  14 ++
 Dockerfile                                    |   2 +-
 Dockerfile.buildtools                         | 165 ++++++++++++++++++
 Dockerfile.compute-node                       |   2 +-
 Dockerfile.compute-tools                      |   2 +-
 9 files changed, 443 insertions(+), 22 deletions(-)
 create mode 100644 .github/workflows/build_and_push_docker_image.yml
 create mode 100644 .github/workflows/update_build_tools_image.yml
 create mode 100644 Dockerfile.buildtools

diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
new file mode 100644
index 0000000000..2bdf4a2066
--- /dev/null
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -0,0 +1,102 @@
+name: Build and Push Docker Image
+
+on:
+  workflow_call:
+    inputs:
+      dockerfile-path:
+        required: true
+        type: string
+      image-name:
+        required: true
+        type: string
+    outputs:
+      build-tools-tag:
+        description: "tag generated for build tools"
+        value: ${{ jobs.tag.outputs.build-tools-tag }}
+
+jobs:
+  tag:
+    runs-on: ubuntu-latest
+    outputs:
+      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
+
+    steps:
+      - name: Get buildtools tag
+        run: |
+          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then
+            IMAGE_TAG=$GITHUB_RUN_ID
+          else
+            IMAGE_TAG=pinned
+          fi
+          
+          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
+        shell: bash
+        id: buildtools-tag
+
+  check-if-build-tools-dockerfile-changed:
+    runs-on: ubuntu-latest
+    outputs:
+      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
+    steps:
+      - name: Check if Dockerfile.buildtools has changed
+        id: dockerfile
+        run: |
+          if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
+            echo "docker_file_changed=false" >> $GITHUB_OUTPUT
+            exit
+          fi
+          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
+          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
+            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
+          fi
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          
+  kaniko:
+    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
+    needs: [ tag, check-if-build-tools-dockerfile-changed ]
+    runs-on: [ self-hosted, dev, x64 ]
+    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1
+
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+
+      - name: Kaniko build
+        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
+
+  kaniko-arm:
+    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
+    needs: [ tag, check-if-build-tools-dockerfile-changed ]
+    runs-on: [ self-hosted, dev, arm64 ]
+    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1
+
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+
+      - name: Kaniko build
+        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+
+  manifest:
+    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
+    name: 'manifest'
+    runs-on: [ self-hosted, dev, x64 ]
+    needs:
+      - tag
+      - kaniko
+      - kaniko-arm
+      - check-if-build-tools-dockerfile-changed
+
+    steps:
+      - name: Create manifest
+        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+
+      - name: Push manifest
+        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6cb6d9df02..77f75b7b82 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -44,7 +44,6 @@ jobs:
 
         exit 1
 
-
   tag:
     needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, small ]
@@ -74,11 +73,19 @@ jobs:
         shell: bash
         id: build-tag
 
-  check-codestyle-python:
+  build-buildtools-image:
     needs: [ check-permissions ]
+    uses: ./.github/workflows/build_and_push_docker_image.yml
+    with:
+      dockerfile-path: Dockerfile.buildtools
+      image-name: build-tools
+    secrets: inherit
+
+  check-codestyle-python:
+    needs: [ check-permissions, build-buildtools-image ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       options: --init
 
     steps:
@@ -108,10 +115,10 @@ jobs:
         run: poetry run mypy .
 
   check-codestyle-rust:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, build-buildtools-image ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       options: --init
 
     steps:
@@ -175,10 +182,10 @@ jobs:
         run: cargo deny check --hide-inclusion-graph
 
   build-neon:
-    needs: [ check-permissions, tag ]
+    needs: [ check-permissions, tag, build-buildtools-image ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       options: --init
     strategy:
       fail-fast: false
@@ -408,10 +415,10 @@ jobs:
         uses: ./.github/actions/save-coverage-data
 
   regress-tests:
-    needs: [ check-permissions, build-neon, tag ]
+    needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       # Default shared memory is 64mb
       options: --init --shm-size=512mb
     strategy:
@@ -447,10 +454,10 @@ jobs:
         uses: ./.github/actions/save-coverage-data
 
   benchmarks:
-    needs: [ check-permissions, build-neon ]
+    needs: [ check-permissions, build-neon, build-buildtools-image ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       # Default shared memory is 64mb
       options: --init --shm-size=512mb
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -479,12 +486,12 @@ jobs:
       # while coverage is currently collected for the debug ones
 
   create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
     if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
 
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       options: --init
 
     steps:
@@ -526,11 +533,10 @@ jobs:
             })
 
   coverage-report:
-    needs: [ check-permissions, regress-tests ]
-
+    needs: [ check-permissions, regress-tests, build-buildtools-image ]
     runs-on: [ self-hosted, gen3, small ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
       options: --init
     strategy:
       fail-fast: false
@@ -694,7 +700,7 @@ jobs:
             }"
 
   neon-image:
-    needs: [ check-permissions, tag ]
+    needs: [ check-permissions, build-buildtools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
@@ -733,6 +739,7 @@ jobs:
                            --context .
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+                           --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                            --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -743,7 +750,7 @@ jobs:
 
   compute-tools-image:
     runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, tag ]
+    needs: [ check-permissions, build-buildtools-image, tag ]
     container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
       run:
@@ -778,6 +785,7 @@ jobs:
                            --context .
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
+                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --dockerfile Dockerfile.compute-tools
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -788,7 +796,7 @@ jobs:
         run: rm -rf ~/.ecr
 
   compute-node-image:
-    needs: [ check-permissions, tag ]
+    needs: [ check-permissions, build-buildtools-image, tag ]
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: gcr.io/kaniko-project/executor:v1.9.2-debug
@@ -836,6 +844,7 @@ jobs:
                            --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                            --build-arg PG_VERSION=${{ matrix.version }}
                            --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
+                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
                            --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                            --dockerfile Dockerfile.compute-node
                            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml
new file mode 100644
index 0000000000..88bab797b7
--- /dev/null
+++ b/.github/workflows/update_build_tools_image.yml
@@ -0,0 +1,130 @@
+name: 'Update build tools image tag'
+
+# This workflow is used to update the tag of the build tools image in ECR.
+# The most common use case is adding/moving the `pinned` tag to the `${GITHUB_RUN_ID}` image.
+
+on:
+  workflow_dispatch:
+    inputs:
+      from-tag:
+        description: 'Source tag'
+        required: true
+        type: string
+      to-tag:
+        description: 'Destination tag'
+        required: true
+        type: string
+        default: 'pinned'
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+env:
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+permissions: {}
+
+jobs:
+  tag-image:
+    runs-on: [ self-hosted, gen3, small ]
+    container: golang:1.19-bullseye
+
+    env:
+      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: ${{ inputs.to-tag }}
+    outputs:
+      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
+      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
+
+    steps:
+      - name: Install Crane & ECR helper
+        run: |
+          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+
+      - name: Configure ECR login
+        run: |
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Get source image digest
+        id: next-digest
+        run: |
+          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
+          if [ -z "${NEXT_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
+            exit 1
+          fi
+
+          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
+          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
+
+      - name: Get destination image digest (if already exists)
+        id: prev-digest
+        run: |
+          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
+          if [ -z "${PREV_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
+          else
+            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
+
+            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Tag image
+        run: |
+          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
+
+  rollback-tag-image:
+    needs:  tag-image
+    if: ${{ !success() }}
+
+    runs-on: [ self-hosted, gen3, small ]
+    container: golang:1.19-bullseye
+
+    env:
+      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: ${{ inputs.to-tag }}
+
+    steps:
+      - name: Install Crane & ECR helper
+        run: |
+          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+
+      - name: Configure ECR login
+        run: |
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Restore previous tag if needed
+        run: |
+          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
+          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
+
+          if [ -z "${NEXT_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
+            exit 0
+          fi
+
+          if [ -z "${PREV_DIGEST}" ]; then
+            # I guess we should delete the tag here/untag the image, but crane does not support it
+            # - https://github.com/google/go-containerregistry/issues/999
+
+            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
+
+            exit 0
+          fi
+
+          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
+          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
+            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
+
+            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
+          else
+            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
+          fi
diff --git a/.gitignore b/.gitignore
index c5fc121ac2..3f4495c9e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ __pycache__/
 test_output/
 .vscode
 .idea
+neon.iml
 /.neon
 /integration_tests/.neon
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2692684006..b318c295a3 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -70,3 +70,17 @@ We're using the following approach to make it work:
 - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
 
 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
+
+## How do I add the "pinned" tag to a buildtools image?
+We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup. Currently, adding the `pinned` tag is a manual operation.
+
+You can run the tagging workflow from the GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
+or using GitHub CLI:
+
+```bash
+gh workflow -R neondatabase/neon run update_build_tools_image.yml \
+            -f from-tag=6254913013 \
+            -f to-tag=pinned
+
+# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
+```
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 60de9cfa3e..5d5fde4f14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,7 @@
 ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
 ### inside this image in the real deployments.
 ARG REPOSITORY=neondatabase
-ARG IMAGE=rust
+ARG IMAGE=build-tools
 ARG TAG=pinned
 
 # Build Postgres
diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools
new file mode 100644
index 0000000000..d3d05b4e20
--- /dev/null
+++ b/Dockerfile.buildtools
@@ -0,0 +1,165 @@
+FROM debian:bullseye-slim
+
+# Add nonroot user
+RUN useradd -ms /bin/bash nonroot -b /home
+SHELL ["/bin/bash", "-c"]
+
+# System deps
+RUN set -e \
+    && apt update \
+    && apt install -y \
+        autoconf \
+        automake \
+        bison \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        flex \
+        git \
+        gnupg \
+        gzip \
+        jq \
+        libcurl4-openssl-dev \
+        libbz2-dev \
+        libffi-dev \
+        liblzma-dev \
+        libncurses5-dev \
+        libncursesw5-dev \
+        libpq-dev \
+        libreadline-dev \
+        libseccomp-dev \
+        libsqlite3-dev \
+        libssl-dev \
+        libstdc++-10-dev \
+        libtool \
+        libxml2-dev \
+        libxmlsec1-dev \
+        libxxhash-dev \
+        lsof \
+        make \
+        netcat \
+        net-tools \
+        openssh-client \
+        parallel \
+        pkg-config \
+        unzip \
+        wget \
+        xz-utils \
+        zlib1g-dev \
+        zstd \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# protobuf-compiler (protoc)
+ENV PROTOC_VERSION 22.2
+RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
+    && unzip -q protoc.zip -d protoc \
+    && mv protoc/bin/protoc /usr/local/bin/protoc \
+    && mv protoc/include/google /usr/local/include/google \
+    && rm -rf protoc.zip protoc
+
+# LLVM
+ENV LLVM_VERSION=17
+RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
+    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
+    && apt update \
+    && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
+    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# PostgreSQL 14
+RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
+    && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
+    && apt update \
+    && apt install -y postgresql-client-14 \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# AWS CLI
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
+    && unzip -q awscliv2.zip \
+    && ./aws/install \
+    && rm awscliv2.zip
+
+# Mold: A Modern Linker
+ENV MOLD_VERSION v2.1.0
+RUN set -e \
+    && git clone https://github.com/rui314/mold.git \
+    && mkdir mold/build \
+    && cd mold/build \
+    && git checkout ${MOLD_VERSION} \
+    && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
+    && cmake --build . -j $(nproc) \
+    && cmake --install . \
+    && cd .. \
+    && rm -rf mold
+
+# LCOV
+# Build lcov from a fork:
+# It includes several bug fixes on top of the v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
+# And patches from us:
+# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
+RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
+    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
+    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992  lcov.tar.gz" | sha256sum --check \
+    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
+    && cd lcov \
+    && make install \
+    && rm -rf ../lcov.tar.gz
+
+# Switch to nonroot user
+USER nonroot:nonroot
+WORKDIR /home/nonroot
+
+# Python
+ENV PYTHON_VERSION=3.9.2 \
+    PYENV_ROOT=/home/nonroot/.pyenv \
+    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
+RUN set -e \
+    && cd $HOME \
+    && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
+    && chmod +x pyenv-installer \
+    && ./pyenv-installer \
+    && export PYENV_ROOT=/home/nonroot/.pyenv \
+    && export PATH="$PYENV_ROOT/bin:$PATH" \
+    && export PATH="$PYENV_ROOT/shims:$PATH" \
+    && pyenv install ${PYTHON_VERSION} \
+    && pyenv global ${PYTHON_VERSION} \
+    && python --version \
+    && pip install --upgrade pip \
+    && pip --version \
+    && pip install pipenv wheel poetry
+
+# Switch to nonroot user (again)
+USER nonroot:nonroot
+WORKDIR /home/nonroot
+
+# Rust
+# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
+ENV RUSTC_VERSION=1.74.0
+ENV RUSTUP_HOME="/home/nonroot/.rustup"
+ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
+	chmod +x rustup-init && \
+	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
+	rm rustup-init && \
+    export PATH="$HOME/.cargo/bin:$PATH" && \
+    . "$HOME/.cargo/env" && \
+    cargo --version && rustup --version && \
+    rustup component add llvm-tools-preview rustfmt clippy && \
+    cargo install --git https://github.com/paritytech/cachepot && \
+    cargo install rustfilt && \
+    cargo install cargo-hakari && \
+    cargo install cargo-deny && \
+    cargo install cargo-hack && \
+    rm -rf /home/nonroot/.cargo/registry && \
+    rm -rf /home/nonroot/.cargo/git
+ENV RUSTC_WRAPPER=cachepot
+
+# Show versions
+RUN whoami \
+    && python --version \
+    && pip --version \
+    && cargo --version --verbose \
+    && rustup --version --verbose \
+    && rustc --version --verbose \
+    && clang --version
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index a23e930c48..8db60ff85f 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -1,6 +1,6 @@
 ARG PG_VERSION
 ARG REPOSITORY=neondatabase
-ARG IMAGE=rust
+ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG BUILD_TAG
 
diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools
index 3066e3f7ca..cc305cc556 100644
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,7 +1,7 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
 ARG REPOSITORY=neondatabase
-ARG IMAGE=rust
+ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG BUILD_TAG
 

From 7d6fc3c826827d8bf7dea789e366c43a483884d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 21 Dec 2023 15:23:09 +0100
Subject: [PATCH 154/202] Use pre-generated initdb.tar.zst in
 test_ingest_real_wal (#6206)

This implements the TODO mentioned in the test added by #5892.
---
 pageserver/src/tenant.rs    |  1 +
 pageserver/src/walingest.rs | 21 +++++++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1d6f1001db..2f2169d194 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3134,6 +3134,7 @@ impl Tenant {
 
     /// For unit tests, make this visible so that other modules can directly create timelines
     #[cfg(test)]
+    #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
     pub(crate) async fn bootstrap_timeline_test(
         &self,
         timeline_id: TimelineId,
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 16b245c488..1d14214030 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1612,6 +1612,7 @@ impl<'a> WalIngest<'a> {
 mod tests {
     use super::*;
     use crate::tenant::harness::*;
+    use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
     use crate::tenant::Timeline;
     use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
     use postgres_ffi::RELSEG_SIZE;
@@ -2177,21 +2178,25 @@ mod tests {
         let pg_version = 15; // The test data was generated by pg15
         let path = "test_data/sk_wal_segment_from_pgbench";
         let wal_segment_path = format!("{path}/000000010000000000000001.zst");
+        let source_initdb_path = format!("{path}/{INITDB_PATH}");
         let startpoint = Lsn::from_hex("14AEC08").unwrap();
         let endpoint = Lsn::from_hex("1FFFF98").unwrap();
 
+        let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
+        let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
+
+        std::fs::create_dir_all(initdb_path.parent().unwrap())
+            .expect("creating test dir should work");
+        std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works");
+
         // Bootstrap a real timeline. We can't use create_test_timeline because
         // it doesn't create a real checkpoint, and Walingest::new tries to parse
         // the garbage data.
-        //
-        // TODO use the initdb.tar.zst file stored with the test data to avoid
-        //      problems with inconsistent initdb results after pg minor version bumps.
-        let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal")
-            .unwrap()
-            .load()
-            .await;
         let tline = tenant
-            .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
+            .bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx)
             .await
             .unwrap();
 

From 1dff98be84fb9aa2497ebf0a36b94143ceb4d729 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 21 Dec 2023 14:55:24 +0000
Subject: [PATCH 155/202] CI: fix build-tools image tag for PRs (#6217)

## Problem

Fix build-tools image tag calculation for PRs.
Broken in https://github.com/neondatabase/neon/pull/6195

## Summary of changes
- Use `pinned` tag instead of `$GITHUB_RUN_ID` if there are no changes in
the dockerfile (and we don't build such an image)
---
 .../workflows/build_and_push_docker_image.yml | 41 ++++++++++---------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
index 2bdf4a2066..e401b2f418 100644
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -15,24 +15,6 @@ on:
         value: ${{ jobs.tag.outputs.build-tools-tag }}
 
 jobs:
-  tag:
-    runs-on: ubuntu-latest
-    outputs:
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
-
-    steps:
-      - name: Get buildtools tag
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then
-            IMAGE_TAG=$GITHUB_RUN_ID
-          else
-            IMAGE_TAG=pinned
-          fi
-          
-          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
-        shell: bash
-        id: buildtools-tag
-
   check-if-build-tools-dockerfile-changed:
     runs-on: ubuntu-latest
     outputs:
@@ -51,7 +33,28 @@ jobs:
           fi
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          
+
+  tag:
+    runs-on: ubuntu-latest
+    needs: [ check-if-build-tools-dockerfile-changed ]
+    outputs:
+      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
+
+    steps:
+      - name: Get buildtools tag
+        env:
+          DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
+        run: |
+          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
+            IMAGE_TAG=$GITHUB_RUN_ID
+          else
+            IMAGE_TAG=pinned
+          fi
+
+          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
+        shell: bash
+        id: buildtools-tag
+
   kaniko:
     if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
     needs: [ tag, check-if-build-tools-dockerfile-changed ]

From a21b71977001b7410d68bb1cc2dfa0352061614b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 21 Dec 2023 17:28:28 +0100
Subject: [PATCH 156/202] Use neon-github-ci-tests S3 bucket for remote_storage
 tests (#6216)

This bucket is already used by the pytests. The current bucket
github-public-dev is more meant for longer living artifacts.

slack thread:
https://neondb.slack.com/archives/C039YKBRZB4/p1703124944669009

Part of https://github.com/neondatabase/cloud/issues/8233 / #6155
---
 .github/workflows/build_and_test.yml    | 2 +-
 .github/workflows/neon_extra_builds.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 77f75b7b82..3091ce6d3a 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -345,7 +345,7 @@ jobs:
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
           ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index b1ea5e4f74..c6c2b7386a 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -218,7 +218,7 @@ jobs:
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
           cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

From 83000b3824dda8a89e29fea7885a15fbb3f00d90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 21 Dec 2023 18:07:21 +0100
Subject: [PATCH 157/202] buildtools: update protoc and mold (#6222)

These updates aren't very important but I would like to try out the new
process as of #6195
---
 Dockerfile.buildtools | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools
index d3d05b4e20..77722f173b 100644
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -51,7 +51,7 @@ RUN set -e \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
 # protobuf-compiler (protoc)
-ENV PROTOC_VERSION 22.2
+ENV PROTOC_VERSION 25.1
 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
     && unzip -q protoc.zip -d protoc \
     && mv protoc/bin/protoc /usr/local/bin/protoc \
@@ -81,7 +81,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
     && rm awscliv2.zip
 
 # Mold: A Modern Linker
-ENV MOLD_VERSION v2.1.0
+ENV MOLD_VERSION v2.4.0
 RUN set -e \
     && git clone https://github.com/rui314/mold.git \
     && mkdir mold/build \

From e68ae2888a6baf4efbe683ac889d4deed7fa5f20 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 22 Dec 2023 10:22:22 +0000
Subject: [PATCH 158/202] pageserver: expedite tenant activation on delete
 (#6190)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

During startup, a tenant delete request might have to retry for many
minutes waiting for a tenant to enter Active state.

## Summary of changes

- Refactor delete_tenant into TenantManager: this is not a functional
change, but will avoid merge conflicts with
https://github.com/neondatabase/neon/pull/6105 later
- Add 412 responses to the swagger definition of this endpoint.
- Use Tenant::wait_to_become_active in `TenantManager::delete_tenant`

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 pageserver/src/http/openapi_spec.yml      |   6 ++
 pageserver/src/http/routes.rs             |   5 +-
 pageserver/src/tenant/delete.rs           |   3 +
 pageserver/src/tenant/mgr.rs              | 100 ++++++++++++++--------
 test_runner/regress/test_timeline_size.py |  51 +++++++++++
 5 files changed, 129 insertions(+), 36 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index b79c5ada9a..1fbca1086f 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -159,6 +159,12 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ConflictError"
+        "412":
+          description: Deletion may not proceed, tenant is not in Active state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PreconditionFailedError"
         "500":
           description: Generic operation error
           content:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 3ea79ea4f2..11a3a2c872 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -308,6 +308,7 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
             SlotUpsertError(e) => e.into(),
             Other(o) => ApiError::InternalServerError(o),
             e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
+            Cancelled => ApiError::ShuttingDown,
         }
     }
 }
@@ -886,7 +887,9 @@ async fn tenant_delete_handler(
 
     let state = get_state(&request);
 
-    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
+    state
+        .tenant_manager
+        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
         .instrument(info_span!("tenant_delete_handler",
             tenant_id = %tenant_shard_id.tenant_id,
             shard = %tenant_shard_id.shard_slug()
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index e8491f26db..b21bad51ba 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -48,6 +48,9 @@ pub(crate) enum DeleteTenantError {
     #[error("Timeline {0}")]
     Timeline(#[from] DeleteTimelineError),
 
+    #[error("Cancelled")]
+    Cancelled,
+
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 31d80026f0..62922e8c99 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1091,6 +1091,71 @@ impl TenantManager {
                 .collect(),
         }
     }
+
+    pub(crate) async fn delete_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+        activation_timeout: Duration,
+    ) -> Result<(), DeleteTenantError> {
+        // We acquire a SlotGuard during this function to protect against concurrent
+        // changes while the ::prepare phase of DeleteTenantFlow executes, but then
+        // have to return the Tenant to the map while the background deletion runs.
+        //
+        // TODO: refactor deletion to happen outside the lifetime of a Tenant.
+        // Currently, deletion requires a reference to the tenants map in order to
+        // keep the Tenant in the map until deletion is complete, and then remove
+        // it at the end.
+        //
+        // See https://github.com/neondatabase/neon/issues/5080
+
+        let slot_guard =
+            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+
+        // unwrap is safe because we used MustExist mode when acquiring
+        let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
+            TenantSlot::Attached(tenant) => tenant.clone(),
+            _ => {
+                // Express "not attached" as equivalent to "not found"
+                return Err(DeleteTenantError::NotAttached);
+            }
+        };
+
+        match tenant.current_state() {
+            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
+                // If a tenant is broken or stopping, DeleteTenantFlow can
+                // handle it: broken tenants proceed to delete, stopping tenants
+                // are checked for deletion already in progress.
+            }
+            _ => {
+                tenant
+                    .wait_to_become_active(activation_timeout)
+                    .await
+                    .map_err(|e| match e {
+                        GetActiveTenantError::WillNotBecomeActive(_) => {
+                            DeleteTenantError::InvalidState(tenant.current_state())
+                        }
+                        GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
+                        GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
+                        GetActiveTenantError::WaitForActiveTimeout {
+                            latest_state: _latest_state,
+                            wait_time: _wait_time,
+                        } => DeleteTenantError::InvalidState(tenant.current_state()),
+                    })?;
+            }
+        }
+
+        let result = DeleteTenantFlow::run(
+            self.conf,
+            self.resources.remote_storage.clone(),
+            &TENANTS,
+            tenant,
+        )
+        .await;
+
+        // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
+        slot_guard.revert();
+        result
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -1268,41 +1333,6 @@ pub(crate) async fn get_active_tenant_with_timeout(
     Ok(tenant)
 }
 
-pub(crate) async fn delete_tenant(
-    conf: &'static PageServerConf,
-    remote_storage: Option<GenericRemoteStorage>,
-    tenant_shard_id: TenantShardId,
-) -> Result<(), DeleteTenantError> {
-    // We acquire a SlotGuard during this function to protect against concurrent
-    // changes while the ::prepare phase of DeleteTenantFlow executes, but then
-    // have to return the Tenant to the map while the background deletion runs.
-    //
-    // TODO: refactor deletion to happen outside the lifetime of a Tenant.
-    // Currently, deletion requires a reference to the tenants map in order to
-    // keep the Tenant in the map until deletion is complete, and then remove
-    // it at the end.
-    //
-    // See https://github.com/neondatabase/neon/issues/5080
-
-    // TODO(sharding): make delete API sharding-aware
-    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
-
-    // unwrap is safe because we used MustExist mode when acquiring
-    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
-        TenantSlot::Attached(tenant) => tenant.clone(),
-        _ => {
-            // Express "not attached" as equivalent to "not found"
-            return Err(DeleteTenantError::NotAttached);
-        }
-    };
-
-    let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
-
-    // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
-    slot_guard.revert();
-    result
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTimelineError {
     #[error("Tenant {0}")]
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 6e510b2eba..11685d1d48 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import math
 import queue
 import random
@@ -24,6 +25,7 @@ from fixtures.pageserver.utils import (
     assert_tenant_state,
     timeline_delete_wait_completed,
     wait_for_upload_queue_empty,
+    wait_tenant_status_404,
     wait_until_tenant_active,
 )
 from fixtures.pg_version import PgVersion
@@ -776,6 +778,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
 
     def get_tenant_states():
         states = {}
+        log.info(f"Tenant ids: {tenant_ids}")
         for tenant_id in tenant_ids:
             tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
             states[tenant_id] = tenant["state"]["slug"]
@@ -872,3 +875,51 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
     )
     assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
+
+    # Check that tenant deletion proactively wakes tenants: this is done separately to the main
+    # body of the test because it will disrupt tenant counts
+    env.pageserver.stop()
+    env.pageserver.start(
+        extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
+    )
+
+    wait_until(10, 1, at_least_one_active)
+    delete_tenant_id = list(
+        [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
+    )[0][0]
+
+    # Deleting a stuck tenant should prompt it to go active
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        log.info("Starting background delete")
+
+        def delete_tenant():
+            env.pageserver.http_client().tenant_delete(delete_tenant_id)
+
+        background_delete = executor.submit(delete_tenant)
+
+        # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
+        # logical size is paused in a failpoint.  So instead we will use a log observation to check that
+        # on-demand activation was triggered by the tenant deletion
+        log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*"
+
+        def activated_on_demand():
+            assert env.pageserver.log_contains(log_match) is not None
+
+        log.info(f"Waiting for activation message '{log_match}'")
+        try:
+            wait_until(10, 1, activated_on_demand)
+        finally:
+            log.info("Clearing failpoint")
+            pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
+
+        # Deletion should complete successfully now that failpoint is unblocked
+        log.info("Joining background delete")
+        background_delete.result(timeout=10)
+
+        # Poll for deletion to complete
+        wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
+        tenant_ids.remove(delete_tenant_id)
+
+    # Check that all the stuck tenants proceed to active (apart from the one that deletes)
+    wait_until(10, 1, all_active)
+    assert len(get_tenant_states()) == n_tenants - 1

From a7342b3897e491de977e5af25bc8c772a5af05b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 22 Dec 2023 14:13:20 +0100
Subject: [PATCH 159/202] remote_storage: store last_modified and etag in
 Download (#6227)

Store the content of the `last-modified` and `etag` HTTP headers in
`Download`.

This serves both as the first step towards #6199 and as preparation
for the tests in #6155.
---
 libs/remote_storage/src/azure_blob.rs | 11 +++++++++++
 libs/remote_storage/src/lib.rs        | 11 +++++++++--
 libs/remote_storage/src/local_fs.rs   | 26 ++++++++++++++------------
 libs/remote_storage/src/s3_bucket.rs  |  4 ++++
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 548bde02f6..7ea1103eb2 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -117,6 +117,8 @@ impl AzureBlobStorage {
     ) -> Result<Download, DownloadError> {
         let mut response = builder.into_stream();
 
+        let mut etag = None;
+        let mut last_modified = None;
         let mut metadata = HashMap::new();
         // TODO give proper streaming response instead of buffering into RAM
         // https://github.com/neondatabase/neon/issues/5563
@@ -124,6 +126,13 @@ impl AzureBlobStorage {
         let mut bufs = Vec::new();
         while let Some(part) = response.next().await {
             let part = part.map_err(to_download_error)?;
+            let etag_str: &str = part.blob.properties.etag.as_ref();
+            if etag.is_none() {
+                etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
+            }
+            if last_modified.is_none() {
+                last_modified = Some(part.blob.properties.last_modified.into());
+            }
             if let Some(blob_meta) = part.blob.metadata {
                 metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
             }
@@ -136,6 +145,8 @@ impl AzureBlobStorage {
         }
         Ok(Download {
             download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
+            etag,
+            last_modified,
             metadata: Some(StorageMetadata(metadata)),
         })
     }
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index e77c54e1e7..3e408e3119 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -14,7 +14,9 @@ mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
 
-use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
+use std::{
+    collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
+};
 
 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -207,8 +209,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
     async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
 }
 
+pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
 pub struct Download {
-    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
+    pub download_stream: DownloadStream,
+    /// The last time the file was modified (`last-modified` HTTP header)
+    pub last_modified: Option<SystemTime>,
+    /// A way to identify this specific version of the resource (`etag` HTTP header)
+    pub etag: Option<String>,
     /// Extra key-value data, associated with the current remote file.
     pub metadata: Option<StorageMetadata>,
 }
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 03b98e5ea2..d1e7d325b9 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 
-use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
+use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
 
 use super::{RemoteStorage, StorageMetadata};
 
@@ -331,6 +331,8 @@ impl RemoteStorage for LocalFs {
                 .map_err(DownloadError::Other)?;
             Ok(Download {
                 metadata,
+                last_modified: None,
+                etag: None,
                 download_stream: Box::pin(source),
             })
         } else {
@@ -372,17 +374,17 @@ impl RemoteStorage for LocalFs {
                 .await
                 .map_err(DownloadError::Other)?;
 
-            Ok(match end_exclusive {
-                Some(end_exclusive) => Download {
-                    metadata,
-                    download_stream: Box::pin(ReaderStream::new(
-                        source.take(end_exclusive - start_inclusive),
-                    )),
-                },
-                None => Download {
-                    metadata,
-                    download_stream: Box::pin(ReaderStream::new(source)),
-                },
+            let download_stream: DownloadStream = match end_exclusive {
+                Some(end_exclusive) => Box::pin(ReaderStream::new(
+                    source.take(end_exclusive - start_inclusive),
+                )),
+                None => Box::pin(ReaderStream::new(source)),
+            };
+            Ok(Download {
+                metadata,
+                last_modified: None,
+                etag: None,
+                download_stream,
             })
         } else {
             Err(DownloadError::NotFound)
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 98be6f0637..0f95458ad1 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -231,6 +231,8 @@ impl S3Bucket {
         match get_object {
             Ok(object_output) => {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
+                let etag = object_output.e_tag.clone();
+                let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
 
                 let body = object_output.body;
                 let body = ByteStreamAsStream::from(body);
@@ -239,6 +241,8 @@ impl S3Bucket {
 
                 Ok(Download {
                     metadata,
+                    etag,
+                    last_modified,
                     download_stream: Box::pin(body),
                 })
             }

From 572bc060110bf0d81dcc3e6317f12f6417733146 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Fri, 22 Dec 2023 20:47:55 +0200
Subject: [PATCH 160/202] Do not copy WAL for lagged slots (#6221)

## Problem

See https://neondb.slack.com/archives/C026T7K2YP9/p1702813041997959

## Summary of changes

Do not take invalidated slots into account when calculating the restart_lsn
position for basebackup at the pageserver.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/walproposer_pg.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 9361f08ad2..a197f425a6 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -12,6 +12,7 @@
 #include <unistd.h>
 #include <sys/stat.h>
 #include "access/xact.h"
+#include "access/xlog.h"
 #include "access/xlogdefs.h"
 #include "access/xlogutils.h"
 #include "access/xloginsert.h"
@@ -51,6 +52,8 @@
 #define XLOG_HDR_START_POS 1	/* offset of start position in wal sender*
 								 * message header */
 
+#define MB ((XLogRecPtr)1024 * 1024)
+
 #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
 
 char	   *wal_acceptors_list = "";
@@ -214,7 +217,6 @@ backpressure_lag_impl(void)
 		XLogRecPtr	myFlushLsn = GetFlushRecPtr();
 #endif
 		replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
-#define MB ((XLogRecPtr)1024 * 1024)
 
 		elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
 			 LSN_FORMAT_ARGS(myFlushLsn),
@@ -1718,12 +1720,15 @@ walprop_pg_after_election(WalProposer *wp)
 		{
 			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
 
-			/*
-			 * start from the beginning of the segment to fetch page headers
-			 * verifed by XLogReader
-			 */
-			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
-			wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
+			if (max_slot_wal_keep_size_mb <= 0 || lrRestartLsn + max_slot_wal_keep_size_mb*MB > wp->truncateLsn)
+			{
+				/*
+				 * start from the beginning of the segment to fetch page headers
+				 * verifed by XLogReader
+				 */
+				lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
+				wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
+			}
 		}
 	}
 }

From cdb08f03621c669a2d6b1efaec89083e0840b4ca Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 23 Oct 2023 17:05:41 +0300
Subject: [PATCH 161/202] Introduce NeonWALReader downloading sk -> compute WAL
 on demand.

It is similar to XLogReader, but when either the requested segment is missing
locally or the requested LSN is before basebackup_lsn, NeonWALReader
asynchronously fetches WAL from one of the safekeepers.

The patch includes the walproposer switch to NeonWALReader; splitting it out
wouldn't make much sense, as it is hard to test otherwise. This finally removes
the risk of pg_wal explosion (as well as slow start time) when one safekeeper is
lagging, while still allowing that safekeeper to be recovered.

In the future the reader should also be used by the logical walsender for similar
reasons (currently we download the tail synchronously on compute start).

The main test is test_lagging_sk. However, I also ran it manually many times while
varying MAX_SEND_SIZE on both sides (on the safekeeper and on the walproposer),
testing various fragmentations (one side having a small buffer, the other side, or
both), which brought up
https://github.com/neondatabase/neon/issues/6055

closes https://github.com/neondatabase/neon/issues/1012
---
 pgxn/neon/Makefile           |   1 +
 pgxn/neon/libpqwalproposer.h |  96 +++++
 pgxn/neon/neon_walreader.c   | 731 +++++++++++++++++++++++++++++++++++
 pgxn/neon/neon_walreader.h   |  29 ++
 pgxn/neon/walproposer.c      | 467 +++++++++++++---------
 pgxn/neon/walproposer.h      | 133 +++----
 pgxn/neon/walproposer_pg.c   | 388 +++++++++++++++----
 7 files changed, 1514 insertions(+), 331 deletions(-)
 create mode 100644 pgxn/neon/libpqwalproposer.h
 create mode 100644 pgxn/neon/neon_walreader.c
 create mode 100644 pgxn/neon/neon_walreader.h

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 466e346e46..c6b224a14d 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -9,6 +9,7 @@ OBJS = \
 	libpagestore.o \
 	neon.o \
 	neon_utils.o \
+	neon_walreader.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
diff --git a/pgxn/neon/libpqwalproposer.h b/pgxn/neon/libpqwalproposer.h
new file mode 100644
index 0000000000..cd7e568a47
--- /dev/null
+++ b/pgxn/neon/libpqwalproposer.h
@@ -0,0 +1,96 @@
+/*
+ * Interface to set of libpq wrappers walproposer and neon_walreader need.
+ * Similar to libpqwalreceiver, but it has blocking connection establishment and
+ * pqexec which don't fit us. Implementation is at walproposer_pg.c.
+ */
+#ifndef ___LIBPQWALPROPOSER_H__
+#define ___LIBPQWALPROPOSER_H__
+
+/* Re-exported and modified ExecStatusType */
+typedef enum
+{
+	/* We received a single CopyBoth result */
+	WP_EXEC_SUCCESS_COPYBOTH,
+
+	/*
+	 * Any success result other than a single CopyBoth was received. The
+	 * specifics of the result were already logged, but it may be useful to
+	 * provide an error message indicating which safekeeper messed up.
+	 *
+	 * Do not expect PQerrorMessage to be appropriately set.
+	 */
+	WP_EXEC_UNEXPECTED_SUCCESS,
+
+	/*
+	 * No result available at this time. Wait until read-ready, then call
+	 * again. Internally, this is returned when PQisBusy indicates that
+	 * PQgetResult would block.
+	 */
+	WP_EXEC_NEEDS_INPUT,
+	/* Catch-all failure. Check PQerrorMessage. */
+	WP_EXEC_FAILED,
+} WalProposerExecStatusType;
+
+/* Possible return values from walprop_async_read */
+typedef enum
+{
+	/* The full read was successful. buf now points to the data */
+	PG_ASYNC_READ_SUCCESS,
+
+	/*
+	 * The read is ongoing. Wait until the connection is read-ready, then try
+	 * again.
+	 */
+	PG_ASYNC_READ_TRY_AGAIN,
+	/* Reading failed. Check PQerrorMessage(conn) */
+	PG_ASYNC_READ_FAIL,
+} PGAsyncReadResult;
+
+/* Possible return values from walprop_async_write */
+typedef enum
+{
+	/* The write fully completed */
+	PG_ASYNC_WRITE_SUCCESS,
+
+	/*
+	 * The write started, but you'll need to call PQflush some more times to
+	 * finish it off. We just tried, so it's best to wait until the connection
+	 * is read- or write-ready to try again.
+	 *
+	 * If it becomes read-ready, call PQconsumeInput and flush again. If it
+	 * becomes write-ready, just call PQflush.
+	 */
+	PG_ASYNC_WRITE_TRY_FLUSH,
+	/* Writing failed. Check PQerrorMessage(conn) */
+	PG_ASYNC_WRITE_FAIL,
+} PGAsyncWriteResult;
+
+/*
+ * This header is included by walproposer.h to define walproposer_api; if we're
+ * building walproposer without pg, ignore libpq part, leaving only interface
+ * types.
+ */
+#ifndef WALPROPOSER_LIB
+
+#include "libpq-fe.h"
+
+/*
+ * Sometimes working directly with underlying PGconn is simpler, export the
+ * whole thing for simplicity.
+ */
+typedef struct WalProposerConn
+{
+	PGconn	   *pg_conn;
+	bool		is_nonblocking; /* whether the connection is non-blocking */
+	char	   *recvbuf;		/* last received CopyData message from
+								 * walprop_async_read */
+} WalProposerConn;
+
+extern WalProposerConn *libpqwp_connect_start(char *conninfo);
+extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
+extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
+extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
+extern void libpqwp_disconnect(WalProposerConn *conn);
+
+#endif							/* WALPROPOSER_LIB */
+#endif							/* ___LIBPQWALPROPOSER_H__ */
diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c
new file mode 100644
index 0000000000..f035c2928f
--- /dev/null
+++ b/pgxn/neon/neon_walreader.c
@@ -0,0 +1,731 @@
+/*
+ * Like WALRead, but when WAL segment doesn't exist locally instead of throwing
+ * ERROR asynchronously tries to fetch it from the most advanced safekeeper.
+ *
+ * We can't use libpqwalreceiver as it blocks during connection establishment
+ * (and waiting for PQExec result), so use libpqwalproposer instead.
+ *
+ * TODO: keepalives are currently never sent, so the other side can close the
+ * connection prematurely.
+ *
+ * TODO: close conn if reading takes too long to prevent stuck connections.
+ */
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "access/xlog_internal.h"
+#include "access/xlogdefs.h"
+#include "access/xlogreader.h"
+#include "libpq/pqformat.h"
+#include "storage/fd.h"
+#include "utils/wait_event.h"
+
+#include "libpq-fe.h"
+
+#include "neon_walreader.h"
+#include "walproposer.h"
+
+#define NEON_WALREADER_ERR_MSG_LEN 512
+
+/*
+ * Can be called where NeonWALReader *state is available in the context, adds log_prefix.
+ */
+#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__)
+
+static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
+static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
+static void NeonWALReaderResetRemote(NeonWALReader *state);
+static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
+static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
+static void neon_wal_segment_close(NeonWALReader *state);
+static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
+								  TimeLineID tli);
+
+/*
+ * State of connection to donor safekeeper.
+ */
+typedef enum
+{
+	/* no remote connection */
+	RS_NONE,
+	/* doing PQconnectPoll, need readable socket */
+	RS_CONNECTING_READ,
+	/* doing PQconnectPoll, need writable socket */
+	RS_CONNECTING_WRITE,
+	/* Waiting for START_REPLICATION result */
+	RS_WAIT_EXEC_RESULT,
+	/* replication stream established */
+	RS_ESTABLISHED,
+} NeonWALReaderRemoteState;
+
+struct NeonWALReader
+{
+	/*
+	 * LSN before which we assume WAL is not available locally. Exists because
+	 * though first segment after startup always exists, part before
+	 * basebackup LSN is filled with zeros.
+	 */
+	XLogRecPtr	available_lsn;
+	WALSegmentContext segcxt;
+	WALOpenSegment seg;
+	int			wre_errno;
+	/* Explains failure to read, static for simplicity. */
+	char		err_msg[NEON_WALREADER_ERR_MSG_LEN];
+
+	/*
+	 * Saved info about request in progress, used to check validity of
+	 * arguments after resume and remember how far we accomplished it. req_lsn
+	 * is 0 if there is no request in progress.
+	 */
+	XLogRecPtr	req_lsn;
+	Size		req_len;
+	Size		req_progress;
+	WalProposer *wp;			/* we learn donor through walproposer */
+	char		donor_name[64]; /* saved donor safekeeper name for logging */
+	/* state of connection to safekeeper */
+	NeonWALReaderRemoteState rem_state;
+	WalProposerConn *wp_conn;
+
+	/*
+	 * position in wp_conn recvbuf from which we'll copy WAL next time, or
+	 * NULL if there is no unprocessed message
+	 */
+	char	   *wal_ptr;
+	Size		wal_rem_len;	/* how many unprocessed bytes left in recvbuf */
+
+	/*
+	 * LSN of wal_ptr position according to walsender to cross check against
+	 * read request
+	 */
+	XLogRecPtr	rem_lsn;
+
+	/* prepended to lines logged by neon_walreader, if provided */
+	char		log_prefix[64];
+};
+
+/* palloc and initialize NeonWALReader */
+NeonWALReader *
+NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
+{
+	NeonWALReader *reader;
+
+	reader = (NeonWALReader *)
+		palloc_extended(sizeof(NeonWALReader),
+						MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
+	if (!reader)
+		return NULL;
+
+	reader->available_lsn = available_lsn;
+	reader->seg.ws_file = -1;
+	reader->seg.ws_segno = 0;
+	reader->seg.ws_tli = 0;
+	reader->segcxt.ws_segsize = wal_segment_size;
+
+	reader->wp = wp;
+
+	reader->rem_state = RS_NONE;
+
+	if (log_prefix)
+		strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix));
+
+	return reader;
+}
+
+void
+NeonWALReaderFree(NeonWALReader *state)
+{
+	if (state->seg.ws_file != -1)
+		neon_wal_segment_close(state);
+	if (state->wp_conn)
+		libpqwp_disconnect(state->wp_conn);
+	pfree(state);
+}
+
+/*
+ * Like vanilla WALRead, but if requested position is before available_lsn or
+ * WAL segment doesn't exist on disk, it tries to fetch needed segment from the
+ * advanced safekeeper.
+ *
+ * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
+ * fetched from timeline 'tli'.
+ *
+ * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
+ * occurs, in which case 'err' has the desciption. Error always closes remote
+ * connection, if there was any, so socket subscription should be removed.
+ *
+ * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
+ * NeonWALReaderSocket and call NeonWALRead again with exactly the same
+ * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq
+ * docs during connection establishment (before first successful read) socket
+ * underneath might change.
+ *
+ * Also, eventually walreader should switch from remote to local read; caller
+ * should remove subscription to socket then by checking NeonWALReaderEvents
+ * after successful read (otherwise next read might reopen the connection with
+ * different socket).
+ *
+ * Reading not monotonically is not supported and will result in error.
+ *
+ * Caller should be sure that WAL up to requested LSN exists, otherwise
+ * NEON_WALREAD_WOULDBLOCK might be always returned.
+ */
+NeonWALReadResult
+NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
+{
+	/*
+	 * If requested data is before known available basebackup lsn or there is
+	 * already active remote state, do remote read.
+	 */
+	if (startptr < state->available_lsn || state->rem_state != RS_NONE)
+	{
+		return NeonWALReadRemote(state, buf, startptr, count, tli);
+	}
+	if (NeonWALReadLocal(state, buf, startptr, count, tli))
+	{
+		return NEON_WALREAD_SUCCESS;
+	}
+	else if (state->wre_errno == ENOENT)
+	{
+		nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
+				LSN_FORMAT_ARGS(startptr));
+		return NeonWALReadRemote(state, buf, startptr, count, tli);
+	}
+	else
+	{
+		return NEON_WALREAD_ERROR;
+	}
+}
+
+/* Do the read from remote safekeeper; progress is kept in state across calls. */
+static NeonWALReadResult
+NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
+{
+	if (state->rem_state == RS_NONE)
+	{
+		XLogRecPtr	donor_lsn;
+
+		/* no connection yet; pick a donor safekeeper and start one */
+		Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
+
+		if (donor == NULL)
+		{
+			snprintf(state->err_msg, sizeof(state->err_msg),
+					 "failed to establish remote connection to fetch WAL: no donor available");
+			return NEON_WALREAD_ERROR;
+		}
+		snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
+		nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
+				state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
+		state->wp_conn = libpqwp_connect_start(donor->conninfo);
+		if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
+		{
+			snprintf(state->err_msg, sizeof(state->err_msg),
+					 "failed to connect to %s to fetch WAL: immediately failed with %s",
+					 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
+			NeonWALReaderResetRemote(state);
+			return NEON_WALREAD_ERROR;
+		}
+		/* we'll poll immediately, in the connecting branch just below */
+		state->rem_state = RS_CONNECTING_READ;
+	}
+
+	if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
+	{
+		switch (PQconnectPoll(state->wp_conn->pg_conn))
+		{
+			case PGRES_POLLING_FAILED:
+				snprintf(state->err_msg, sizeof(state->err_msg),
+						 "failed to connect to %s to fetch WAL: poll error: %s",
+						 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
+				NeonWALReaderResetRemote(state);
+				return NEON_WALREAD_ERROR;
+			case PGRES_POLLING_READING:
+				state->rem_state = RS_CONNECTING_READ;
+				return NEON_WALREAD_WOULDBLOCK;
+			case PGRES_POLLING_WRITING:
+				state->rem_state = RS_CONNECTING_WRITE;
+				return NEON_WALREAD_WOULDBLOCK;
+			case PGRES_POLLING_OK:
+				{
+					/* connection successfully established; request the WAL stream */
+					char		start_repl_query[128];
+
+					snprintf(start_repl_query, sizeof(start_repl_query),
+							 "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
+							 LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
+					nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
+							state->donor_name, start_repl_query);
+					if (!libpqwp_send_query(state->wp_conn, start_repl_query))
+					{
+						snprintf(state->err_msg, sizeof(state->err_msg),
+								 "failed to send %s query to %s: %s",
+								 start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
+						NeonWALReaderResetRemote(state);
+						return NEON_WALREAD_ERROR;
+					}
+					state->rem_state = RS_WAIT_EXEC_RESULT;
+					break;
+				}
+
+			default:			/* there is unused PGRES_POLLING_ACTIVE */
+				Assert(false);
+				return NEON_WALREAD_ERROR;	/* keep the compiler quiet */
+		}
+	}
+
+	if (state->rem_state == RS_WAIT_EXEC_RESULT)
+	{
+		switch (libpqwp_get_query_result(state->wp_conn))
+		{
+			case WP_EXEC_SUCCESS_COPYBOTH:
+				state->rem_state = RS_ESTABLISHED;
+				break;
+			case WP_EXEC_NEEDS_INPUT:
+				return NEON_WALREAD_WOULDBLOCK;
+			case WP_EXEC_FAILED:
+				snprintf(state->err_msg, sizeof(state->err_msg),
+						 "get START_REPLICATION result from %s failed: %s",
+						 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
+				NeonWALReaderResetRemote(state);
+				return NEON_WALREAD_ERROR;
+			default:			/* can't happen */
+				snprintf(state->err_msg, sizeof(state->err_msg),
+						 "get START_REPLICATION result from %s: unexpected result",
+						 state->donor_name);
+				NeonWALReaderResetRemote(state);
+				return NEON_WALREAD_ERROR;
+		}
+	}
+
+	Assert(state->rem_state == RS_ESTABLISHED);
+
+	/*
+	 * If we had the request before, verify args are the same and advance the
+	 * result ptr according to the progress; otherwise register the request.
+	 */
+	if (state->req_lsn != InvalidXLogRecPtr)
+	{
+		if (state->req_lsn != startptr || state->req_len != count)
+		{
+			snprintf(state->err_msg, sizeof(state->err_msg),
+					 "args changed during request, was %X/%X %zu, now %X/%X %zu",
+					 LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
+			NeonWALReaderResetRemote(state);
+			return NEON_WALREAD_ERROR;
+		}
+		nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
+				LSN_FORMAT_ARGS(startptr),
+				count,
+				state->req_progress);
+		buf += state->req_progress;	/* skip bytes copied by earlier calls */
+	}
+	else
+	{
+		state->req_lsn = startptr;
+		state->req_len = count;
+		state->req_progress = 0;
+		nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
+				LSN_FORMAT_ARGS(startptr),
+				count);
+	}
+
+	while (true)
+	{
+		Size		to_copy;
+
+		/*
+		 * If we have no ready data, receive new message.
+		 */
+		if (state->wal_rem_len == 0 &&
+
+		/*
+		 * check for the sake of 0 length reads; walproposer does these for
+		 * heartbeats, though generally they shouldn't hit remote source.
+		 */
+			state->req_len - state->req_progress > 0)
+		{
+			NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);
+
+			if (read_msg_res != NEON_WALREAD_SUCCESS)
+				return read_msg_res;
+		}
+
+		if (state->req_lsn + state->req_progress != state->rem_lsn)
+		{
+			snprintf(state->err_msg, sizeof(state->err_msg),
+					 "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
+					 LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
+					 LSN_FORMAT_ARGS(state->rem_lsn),
+					 LSN_FORMAT_ARGS(state->req_lsn),
+					 state->req_len);
+			NeonWALReaderResetRemote(state);
+			return NEON_WALREAD_ERROR;
+		}
+
+		/* We can copy min of (available, requested) bytes. */
+		to_copy =
+			Min(state->req_len - state->req_progress, state->wal_rem_len);
+		memcpy(buf, state->wal_ptr, to_copy);
+		state->wal_ptr += to_copy;
+		state->wal_rem_len -= to_copy;
+		state->rem_lsn += to_copy;	/* LSN of the next unconsumed byte */
+		if (state->wal_rem_len == 0)
+			state->wal_ptr = NULL;	/* freed by libpqwalproposer */
+		buf += to_copy;
+		state->req_progress += to_copy;
+		if (state->req_progress == state->req_len)
+		{
+			XLogSegNo	next_segno;
+			XLogSegNo	req_segno;
+
+			XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
+			XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);
+
+			/*
+			 * Request completed. If there is a chance of serving next one
+			 * locally, close the connection.
+			 */
+			if (state->req_lsn < state->available_lsn &&
+				state->rem_lsn >= state->available_lsn)
+			{
+				nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
+						LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
+				NeonWALReaderResetRemote(state);
+			}
+			else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
+					 is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
+			{
+				nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
+						LSN_FORMAT_ARGS(state->rem_lsn));
+				NeonWALReaderResetRemote(state);
+			}
+			state->req_lsn = InvalidXLogRecPtr;
+			state->req_len = 0;
+			state->req_progress = 0;
+			return NEON_WALREAD_SUCCESS;
+		}
+	}
+}
+
+/*
+ * Read one 'w' (WAL data) message from the stream, skipping keepalives; on
+ * success sets state->wal_ptr, wal_rem_len and rem_lsn. Resets remote state on failure.
+ */
+static NeonWALReadResult
+NeonWALReaderReadMsg(NeonWALReader *state)
+{
+	while (true)				/* loop until we get 'w' */
+	{
+		char	   *copydata_ptr;
+		int			copydata_size;
+		StringInfoData s;
+		char		msg_type;
+		int			hdrlen;
+
+		Assert(state->rem_state == RS_ESTABLISHED);
+		Assert(state->wal_ptr == NULL && state->wal_rem_len == 0);
+
+		switch (libpqwp_async_read(state->wp_conn,
+								   &copydata_ptr,
+								   &copydata_size))
+		{
+			case PG_ASYNC_READ_SUCCESS:
+				break;
+			case PG_ASYNC_READ_TRY_AGAIN:
+				return NEON_WALREAD_WOULDBLOCK;
+			case PG_ASYNC_READ_FAIL:
+				snprintf(state->err_msg,
+						 sizeof(state->err_msg),
+						 "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s",
+						 LSN_FORMAT_ARGS(state->req_lsn),
+						 state->req_len,
+						 state->req_progress,
+						 PQerrorMessage(state->wp_conn->pg_conn));
+				goto err;
+		}
+
+		/* put data on StringInfo to parse */
+		s.data = copydata_ptr;
+		s.len = copydata_size;
+		s.cursor = 0;
+		s.maxlen = -1;
+
+		if (copydata_size == 0)
+		{
+			snprintf(state->err_msg,
+					 sizeof(state->err_msg),
+					 "zero length copydata received");
+			goto err;
+		}
+		msg_type = pq_getmsgbyte(&s);
+		switch (msg_type)
+		{
+			case 'w':
+				{
+					XLogRecPtr	start_lsn;
+
+					hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);	/* start_lsn, end_lsn, send_time */
+					if (s.len - s.cursor < hdrlen)
+					{
+						snprintf(state->err_msg,
+								 sizeof(state->err_msg),
+								 "invalid WAL message received from primary");
+						goto err;
+					}
+
+					start_lsn = pq_getmsgint64(&s);
+					pq_getmsgint64(&s); /* XLogRecPtr	end_lsn; */
+					pq_getmsgint64(&s); /* TimestampTz send_time */
+
+					state->rem_lsn = start_lsn;
+					state->wal_rem_len = (Size) (s.len - s.cursor);
+					state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor);
+					nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu",
+							LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len);
+
+					return NEON_WALREAD_SUCCESS;
+				}
+			case 'k':
+				{
+					XLogRecPtr	end_lsn;
+					bool		reply_requested;
+
+					hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);	/* end_lsn, timestamp, reply_requested */
+					if (s.len - s.cursor < hdrlen)
+					{
+						snprintf(state->err_msg, sizeof(state->err_msg),
+								 "invalid keepalive message received from primary");
+						goto err;
+					}
+
+					end_lsn = pq_getmsgint64(&s);
+					pq_getmsgint64(&s); /* TimestampTz timestamp; */
+					reply_requested = pq_getmsgbyte(&s);
+					nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d",
+							LSN_FORMAT_ARGS(end_lsn),
+							reply_requested);
+					if (end_lsn < state->req_lsn + state->req_len)
+					{
+						snprintf(state->err_msg, sizeof(state->err_msg),
+								 "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X",
+								 LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn));
+						goto err;
+					}
+					continue;	/* keepalive carries no WAL; read the next message */
+				}
+			default:
+				nwr_log(WARNING, "invalid replication message type %d", msg_type);
+				continue;
+		}
+	}
+err:
+	NeonWALReaderResetRemote(state);
+	return NEON_WALREAD_ERROR;
+}
+
+/* drop the remote connection (if any) and forget the request in progress */
+static void
+NeonWALReaderResetRemote(NeonWALReader *state)
+{
+	if (state->wp_conn != NULL)
+	{
+		libpqwp_disconnect(state->wp_conn);
+		state->wp_conn = NULL;
+	}
+
+	/* back to the "no remote activity" state with nothing buffered */
+	state->rem_state = RS_NONE;
+	state->donor_name[0] = '\0';
+	state->req_lsn = state->rem_lsn = InvalidXLogRecPtr;
+	state->req_len = state->req_progress = 0;
+	state->wal_rem_len = 0;
+	state->wal_ptr = NULL;
+}
+
+/*
+ * Return the socket of the connection to the remote source. Must be called
+ * only while that connection exists (NeonWALReaderEvents returns non-zero).
+ */
+pgsocket
+NeonWALReaderSocket(NeonWALReader *state)
+{
+	if (!state->wp_conn)
+		nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection");
+	return PQsocket(state->wp_conn->pg_conn);
+}
+
+/*
+ * Report which socket events the caller should wait for on the remote
+ * connection; 0 means there is no active remote connection.
+ */
+extern uint32
+NeonWALReaderEvents(NeonWALReader *state)
+{
+	switch (state->rem_state)
+	{
+		case RS_NONE:
+			return 0;
+		case RS_CONNECTING_WRITE:
+			return WL_SOCKET_WRITEABLE;
+			/* all remaining live states wait for input from the donor */
+		case RS_CONNECTING_READ:
+		case RS_WAIT_EXEC_RESULT:
+		case RS_ESTABLISHED:
+			return WL_SOCKET_READABLE;
+		default:
+			Assert(false);
+			return 0;			/* make compiler happy */
+	}
+}
+
+/*
+ * Read count bytes of WAL at startptr from local segment files into buf. On
+ * failure returns false, sets err_msg and wre_errno (0 for unexpected EOF).
+ */
+static bool
+NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
+{
+	char	   *p = buf;
+	XLogRecPtr	recptr = startptr;
+	Size		nbytes = count;
+
+	while (nbytes > 0)
+	{
+		uint32		startoff;
+		int			segbytes;
+		int			readbytes;
+
+		startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
+
+		/*
+		 * If the data we want is not in a segment we have open, close what we
+		 * have (if anything) and open the one containing recptr with
+		 * neon_wal_segment_open.
+		 */
+		if (state->seg.ws_file < 0 ||
+			!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
+			tli != state->seg.ws_tli)
+		{
+			XLogSegNo	nextSegNo;
+
+			neon_wal_segment_close(state);
+
+			XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
+			if (!neon_wal_segment_open(state, nextSegNo, &tli))
+			{
+				char		fname[MAXFNAMELEN];
+
+				state->wre_errno = errno;
+
+				XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
+				snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
+						 fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
+				return false;
+			}
+
+			/* This shouldn't happen -- indicates a bug in segment_open */
+			Assert(state->seg.ws_file >= 0);
+
+			/* Update the current segment info. */
+			state->seg.ws_tli = tli;
+			state->seg.ws_segno = nextSegNo;
+		}
+
+		/* How many bytes are within this segment? */
+		if (nbytes > (state->segcxt.ws_segsize - startoff))
+			segbytes = state->segcxt.ws_segsize - startoff;
+		else
+			segbytes = nbytes;
+
+#ifndef FRONTEND
+		pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
+#endif
+
+		/* Reset errno first; eases reporting non-errno-affecting errors */
+		errno = 0;
+		readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
+
+#ifndef FRONTEND
+		pgstat_report_wait_end();
+#endif
+
+		if (readbytes <= 0)
+		{
+			char		fname[MAXFNAMELEN];
+
+			/* capture errno before formatting can clobber it; EOF is 0 */
+			state->wre_errno = (readbytes < 0) ? errno : 0;
+			XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
+			if (readbytes < 0)
+			{
+				snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %u: %s",
+						 fname, startoff, strerror(state->wre_errno));
+			}
+			else
+			{
+				snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %u: unexpected EOF",
+						 fname, startoff);
+			}
+			return false;
+		}
+
+		/* Update state for read */
+		recptr += readbytes;
+		nbytes -= readbytes;
+		p += readbytes;
+	}
+
+	return true;
+}
+
+/*
+ * Variant of vanilla wal_segment_open for local pg_wal files that reports
+ * failure by returning false (errno is left set) rather than raising ERROR.
+ *
+ * On success the descriptor is stored in state->seg.ws_file.
+ */
+static bool
+neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
+					  TimeLineID *tli_p)
+{
+	char		path[MAXPGPATH];
+
+	XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
+	nwr_log(DEBUG5, "opening %s", path);
+	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+
+	/*
+	 * A negative descriptor means the open failed; errno is left as set.
+	 */
+	return state->seg.ws_file >= 0;
+}
+
+static bool
+is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
+{
+	char		seg_path[MAXPGPATH];
+	struct stat st;
+
+	XLogFilePath(seg_path, tli, segno, segsize);
+	return stat(seg_path, &st) == 0;	/* any stat failure counts as absent */
+}
+
+/* NeonWALReader flavour of vanilla wal_segment_close */
+static void
+neon_wal_segment_close(NeonWALReader *state)
+{
+	if (state->seg.ws_file < 0)
+		return;					/* no segment is open */
+
+	/* close() result is not checked (need to check errno?) */
+	close(state->seg.ws_file);
+	state->seg.ws_file = -1;
+}
+
+char *
+NeonWALReaderErrMsg(NeonWALReader *state)
+{
+	return state->err_msg;		/* buffer the failing read paths snprintf into */
+}
diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h
new file mode 100644
index 0000000000..805c94fc53
--- /dev/null
+++ b/pgxn/neon/neon_walreader.h
@@ -0,0 +1,29 @@
+#ifndef __NEON_WALREADER_H__
+#define __NEON_WALREADER_H__
+
+#include "access/xlogdefs.h"
+
+/* forward declare so we don't have to expose the struct to the public */
+struct NeonWALReader;
+typedef struct NeonWALReader NeonWALReader;
+
+/* avoid including walproposer.h as it includes us */
+struct WalProposer;
+typedef struct WalProposer WalProposer;
+
+/* NeonWALRead return value */
+typedef enum
+{
+	NEON_WALREAD_SUCCESS,		/* requested bytes were copied into buf */
+	NEON_WALREAD_WOULDBLOCK,	/* remote read must wait on its socket */
+	NEON_WALREAD_ERROR,			/* failed; see NeonWALReaderErrMsg */
+} NeonWALReadResult;
+
+extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
+extern void NeonWALReaderFree(NeonWALReader *state);
+extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
+extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
+extern uint32 NeonWALReaderEvents(NeonWALReader *state);
+extern char *NeonWALReaderErrMsg(NeonWALReader *state);
+
+#endif							/* __NEON_WALREADER_H__ */
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index fc3332612c..4fb9a46d15 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -45,7 +45,6 @@
 
 /* Prototypes for private functions */
 static void WalProposerLoop(WalProposer *wp);
-static void HackyRemoveWalProposerEvent(Safekeeper *to_remove);
 static void ShutdownConnection(Safekeeper *sk);
 static void ResetConnection(Safekeeper *sk);
 static long TimeToReconnect(WalProposer *wp, TimestampTz now);
@@ -78,11 +77,11 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper
 static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state);
 static bool AsyncFlush(Safekeeper *sk);
 static int	CompareLsn(const void *a, const void *b);
-static char *FormatSafekeeperState(SafekeeperState state);
+static char *FormatSafekeeperState(Safekeeper *sk);
 static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
-static uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
 static char *FormatEvents(WalProposer *wp, uint32 events);
 
+
 WalProposer *
 WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 {
@@ -113,6 +112,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 		wp->safekeeper[wp->n_safekeepers].host = host;
 		wp->safekeeper[wp->n_safekeepers].port = port;
 		wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE;
+		wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND;
 		wp->safekeeper[wp->n_safekeepers].wp = wp;
 
 		{
@@ -127,8 +127,6 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 		}
 
 		initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
-		wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]);
-		wp->safekeeper[wp->n_safekeepers].flushWrite = false;
 		wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
 		wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr;
 		wp->n_safekeepers += 1;
@@ -277,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
 											   wp->config->safekeeper_connection_timeout))
 				{
 					walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
-								sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
+								sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -305,58 +303,20 @@ WalProposerLoop(WalProposer *wp)
 		WalProposerPoll(wp);
 }
 
-/*
- * Hack: provides a way to remove the event corresponding to an individual walproposer from the set.
- *
- * Note: Internally, this completely reconstructs the event set. It should be avoided if possible.
- */
-static void
-HackyRemoveWalProposerEvent(Safekeeper *to_remove)
-{
-	WalProposer *wp = to_remove->wp;
-
-	/* Remove the existing event set, assign sk->eventPos = -1 */
-	wp->api.free_event_set(wp);
-	/* Re-initialize it without adding any safekeeper events */
-	wp->api.init_event_set(wp);
-
-	/*
-	 * loop through the existing safekeepers. If they aren't the one we're
-	 * removing, and if they have a socket we can use, re-add the applicable
-	 * events.
-	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
-	{
-		uint32		desired_events = WL_NO_EVENTS;
-		Safekeeper *sk = &wp->safekeeper[i];
-
-		if (sk == to_remove)
-			continue;
-
-		/* If this safekeeper isn't offline, add an event for it! */
-		if (sk->state != SS_OFFLINE)
-		{
-			desired_events = SafekeeperStateDesiredEvents(sk->state);
-			/* will set sk->eventPos */
-			wp->api.add_safekeeper_event_set(sk, desired_events);
-		}
-	}
-}
 
 /* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */
 static void
 ShutdownConnection(Safekeeper *sk)
 {
-	sk->wp->api.conn_finish(sk);
 	sk->state = SS_OFFLINE;
-	sk->flushWrite = false;
 	sk->streamingAt = InvalidXLogRecPtr;
 
 	if (sk->voteResponse.termHistory.entries)
 		pfree(sk->voteResponse.termHistory.entries);
 	sk->voteResponse.termHistory.entries = NULL;
 
-	HackyRemoveWalProposerEvent(sk);
+	sk->wp->api.conn_finish(sk);
+	sk->wp->api.rm_safekeeper_event_set(sk);
 }
 
 /*
@@ -474,7 +434,9 @@ ReconnectSafekeepers(WalProposer *wp)
 static void
 AdvancePollState(Safekeeper *sk, uint32 events)
 {
+#ifdef WALPROPOSER_LIB			/* walprop_log needs wp in lib build */
 	WalProposer *wp = sk->wp;
+#endif
 
 	/*
 	 * Sanity check. We assume further down that the operations don't block
@@ -527,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_VOTING:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-						sk->port, FormatSafekeeperState(sk->state));
+						sk->port, FormatSafekeeperState(sk));
 			ResetConnection(sk);
 			return;
 
@@ -556,7 +518,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_IDLE:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-						sk->port, FormatSafekeeperState(sk->state));
+						sk->port, FormatSafekeeperState(sk));
 			ResetConnection(sk);
 			return;
 
@@ -622,7 +584,7 @@ HandleConnectionEvent(Safekeeper *sk)
 	 * Because PQconnectPoll can change the socket, we have to un-register the
 	 * old event and re-register an event on the new socket.
 	 */
-	HackyRemoveWalProposerEvent(sk);
+	wp->api.rm_safekeeper_event_set(sk);
 	wp->api.add_safekeeper_event_set(sk, new_events);
 
 	/* If we successfully connected, send START_WAL_PUSH query */
@@ -1112,6 +1074,9 @@ SendProposerElected(Safekeeper *sk)
 	term_t		lastCommonTerm;
 	int			i;
 
+	/* Now that we are ready to send it's a good moment to create WAL reader */
+	wp->api.wal_reader_allocate(sk);
+
 	/*
 	 * Determine start LSN by comparing safekeeper's log term switch history
 	 * and proposer's, searching for the divergence point.
@@ -1231,6 +1196,7 @@ StartStreaming(Safekeeper *sk)
 	 * once for a connection.
 	 */
 	sk->state = SS_ACTIVE;
+	sk->active_state = SS_ACTIVE_SEND;
 	sk->streamingAt = sk->startStreamingAt;
 
 	/* event set will be updated inside SendMessageToNode */
@@ -1289,9 +1255,13 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 {
 	WalProposer *wp = sk->wp;
 
-	uint32		newEvents = WL_SOCKET_READABLE;
-
-	if (events & WL_SOCKET_WRITEABLE)
+	/*
+	 * Note: we don't know which socket awoke us (sk or nwr). However, as
+	 * SendAppendRequests always tries to send at least one msg in
+	 * SS_ACTIVE_SEND, be careful not to go there if we are only after sk
+	 * response, otherwise it'd create a busy loop of pings.
+	 */
+	if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL)
 		if (!SendAppendRequests(sk))
 			return;
 
@@ -1299,28 +1269,29 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 		if (!RecvAppendResponses(sk))
 			return;
 
-	/*
-	 * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data
-	 * in the buffer.
-	 *
-	 * LSN comparison checks if we have pending unsent messages. This check
-	 * isn't necessary now, because we always send append messages immediately
-	 * after arrival. But it's good to have it here in case we change this
-	 * behavior in the future.
-	 */
-	if (sk->streamingAt != wp->availableLsn || sk->flushWrite)
-		newEvents |= WL_SOCKET_WRITEABLE;
+#if PG_VERSION_NUM >= 150000
+	/* expected never to happen, c.f. walprop_pg_active_state_update_event_set */
+	if (events & WL_SOCKET_CLOSED)
+	{
+		walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
+					sk->host, sk->port);
+		ShutdownConnection(sk);
+		return;
+	}
+#endif
 
-	wp->api.update_event_set(sk, newEvents);
+	/* configure the event set to wait on before yielding, whatever the substate is */
+	wp->api.active_state_update_event_set(sk);
 }
 
 /*
  * Send WAL messages starting from sk->streamingAt until the end or non-writable
- * socket, whichever comes first. Caller should take care of updating event set.
- * Even if no unsent WAL is available, at least one empty message will be sent
- * as a heartbeat, if socket is ready.
+ * socket or neon_walreader blocks, whichever comes first; active_state is
+ * updated accordingly. Caller should take care of updating event set. Even if
+ * no unsent WAL is available, at least one empty message will be sent as a
+ * heartbeat, if socket is ready.
  *
- * Can change state if Async* functions encounter errors and reset connection.
+ * Resets state and kills the connections if any error on them is encountered.
  * Returns false in this case, true otherwise.
  */
 static bool
@@ -1328,11 +1299,11 @@ SendAppendRequests(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
 	XLogRecPtr	endLsn;
-	AppendRequestHeader *req;
 	PGAsyncWriteResult writeResult;
 	bool		sentAnything = false;
+	AppendRequestHeader *req;
 
-	if (sk->flushWrite)
+	if (sk->active_state == SS_ACTIVE_FLUSH)
 	{
 		if (!AsyncFlush(sk))
 
@@ -1343,76 +1314,101 @@ SendAppendRequests(Safekeeper *sk)
 			return sk->state == SS_ACTIVE;
 
 		/* Event set will be updated in the end of HandleActiveState */
-		sk->flushWrite = false;
+		sk->active_state = SS_ACTIVE_SEND;
 	}
 
 	while (sk->streamingAt != wp->availableLsn || !sentAnything)
 	{
-		sentAnything = true;
-
-		endLsn = sk->streamingAt;
-		endLsn += MAX_SEND_SIZE;
-
-		/* if we went beyond available WAL, back off */
-		if (endLsn > wp->availableLsn)
+		if (sk->active_state == SS_ACTIVE_SEND)
 		{
-			endLsn = wp->availableLsn;
+			sentAnything = true;
+
+			endLsn = sk->streamingAt;
+			endLsn += MAX_SEND_SIZE;
+
+			/* if we went beyond available WAL, back off */
+			if (endLsn > wp->availableLsn)
+			{
+				endLsn = wp->availableLsn;
+			}
+
+			req = &sk->appendRequest;
+			PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
+
+			walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
+						req->endLsn - req->beginLsn,
+						LSN_FORMAT_ARGS(req->beginLsn),
+						LSN_FORMAT_ARGS(req->endLsn),
+						LSN_FORMAT_ARGS(req->commitLsn),
+						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
+
+			resetStringInfo(&sk->outbuf);
+
+			/* write AppendRequest header */
+			appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
+			enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
+			sk->active_state = SS_ACTIVE_READ_WAL;
 		}
 
-		req = &sk->appendRequest;
-		PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
-
-		walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
-					req->endLsn - req->beginLsn,
-					LSN_FORMAT_ARGS(req->beginLsn),
-					LSN_FORMAT_ARGS(req->endLsn),
-					LSN_FORMAT_ARGS(req->commitLsn),
-					LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
-
-		resetStringInfo(&sk->outbuf);
-
-		/* write AppendRequest header */
-		appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
-
-		/* write the WAL itself */
-		enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
-		/* wal_read will raise error on failure */
-		wp->api.wal_read(sk,
-						 &sk->outbuf.data[sk->outbuf.len],
-						 req->beginLsn,
-						 req->endLsn - req->beginLsn);
-		sk->outbuf.len += req->endLsn - req->beginLsn;
-
-		writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
-
-		/* Mark current message as sent, whatever the result is */
-		sk->streamingAt = endLsn;
-
-		switch (writeResult)
+		if (sk->active_state == SS_ACTIVE_READ_WAL)
 		{
-			case PG_ASYNC_WRITE_SUCCESS:
-				/* Continue writing the next message */
-				break;
+			char	   *errmsg;
 
-			case PG_ASYNC_WRITE_TRY_FLUSH:
+			req = &sk->appendRequest;
 
-				/*
-				 * * We still need to call PQflush some more to finish the
-				 * job. Caller function will handle this by setting right
-				 * event* set.
-				 */
-				sk->flushWrite = true;
-				return true;
+			switch (wp->api.wal_read(sk,
+									 &sk->outbuf.data[sk->outbuf.len],
+									 req->beginLsn,
+									 req->endLsn - req->beginLsn,
+									 &errmsg))
+			{
+				case NEON_WALREAD_SUCCESS:
+					break;
+				case NEON_WALREAD_WOULDBLOCK:
+					return true;
+				case NEON_WALREAD_ERROR:
+					walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
+								sk->host, sk->port, errmsg);
+					ShutdownConnection(sk);
+					return false;
+				default:
+					Assert(false);
+			}
 
-			case PG_ASYNC_WRITE_FAIL:
-				walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-							sk->host, sk->port, FormatSafekeeperState(sk->state),
-							wp->api.conn_error_message(sk));
-				ShutdownConnection(sk);
-				return false;
-			default:
-				Assert(false);
-				return false;
+			sk->outbuf.len += req->endLsn - req->beginLsn;
+
+			writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
+
+			/* Mark current message as sent, whatever the result is */
+			sk->streamingAt = req->endLsn;
+
+			switch (writeResult)
+			{
+				case PG_ASYNC_WRITE_SUCCESS:
+					/* Continue writing the next message */
+					sk->active_state = SS_ACTIVE_SEND;
+					break;
+
+				case PG_ASYNC_WRITE_TRY_FLUSH:
+
+					/*
+					 * We still need to call PQflush some more to finish the
+					 * job. Caller function will handle this by setting right
+					 * event set.
+					 */
+					sk->active_state = SS_ACTIVE_FLUSH;
+					return true;
+
+				case PG_ASYNC_WRITE_FAIL:
+					walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
+								sk->host, sk->port, FormatSafekeeperState(sk),
+								wp->api.conn_error_message(sk));
+					ShutdownConnection(sk);
+					return false;
+				default:
+					Assert(false);
+					return false;
+			}
 		}
 	}
 
@@ -1422,7 +1418,7 @@ SendAppendRequests(Safekeeper *sk)
 /*
  * Receive and process all available feedback.
  *
- * Can change state if Async* functions encounter errors and reset connection.
+ * Resets state and kills the connection if any error on it is encountered.
  * Returns false in this case, true otherwise.
  *
  * NB: This function can call SendMessageToNode and produce new messages.
@@ -1608,6 +1604,53 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
 	return responses[wp->n_safekeepers - wp->quorum];
 }
 
+/*
+ * Pick a safekeeper with a live connection to download WAL from; returns NULL
+ * when there is none. *donor_lsn receives the donor's end position as far as
+ * we know it (InvalidXLogRecPtr when NULL is returned).
+ */
+Safekeeper *
+GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
+{
+	Safekeeper *donor = NULL;
+	Safekeeper *sk;
+
+	*donor_lsn = InvalidXLogRecPtr;
+	if (wp->n_votes < wp->quorum)
+	{
+		walprop_log(WARNING, "GetDonor called before elections are won");
+		return NULL;
+	}
+
+	/*
+	 * Start with the node that determined our term start LSN: its position
+	 * is known right after the election, before any feedback has been
+	 * sent.
+	 */
+	sk = &wp->safekeeper[wp->donor];
+	if (sk->state >= SS_IDLE)
+	{
+		donor = sk;
+		*donor_lsn = wp->propEpochStartLsn;
+	}
+
+	/*
+	 * Then look at feedback from every node with a live connection and keep
+	 * the highest flush position. A node that sends feedback has already
+	 * processed the elected message, so its term is fine.
+	 */
+	for (sk = wp->safekeeper; sk < wp->safekeeper + wp->n_safekeepers; sk++)
+	{
+		if (sk->state != SS_ACTIVE || sk->appendResponse.flushLsn <= *donor_lsn)
+			continue;
+
+		donor = sk;
+		*donor_lsn = sk->appendResponse.flushLsn;
+	}
+
+	return donor;
+}
+
 static void
 HandleSafekeeperResponse(WalProposer *wp)
 {
@@ -1713,7 +1756,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
 
 		case PG_ASYNC_READ_FAIL:
 			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
-						sk->port, FormatSafekeeperState(sk->state),
+						sk->port, FormatSafekeeperState(sk),
 						wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
@@ -1753,7 +1796,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 	if (tag != anymsg->tag)
 	{
 		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
-					sk->port, FormatSafekeeperState(sk->state));
+					sk->port, FormatSafekeeperState(sk));
 		ResetConnection(sk);
 		return false;
 	}
@@ -1824,12 +1867,13 @@ static bool
 BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state)
 {
 	WalProposer *wp = sk->wp;
-	uint32		events;
+	uint32		sk_events;
+	uint32		nwr_events;
 
 	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 	{
 		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-					sk->host, sk->port, FormatSafekeeperState(sk->state),
+					sk->host, sk->port, FormatSafekeeperState(sk),
 					wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return false;
@@ -1841,9 +1885,15 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
 	 * If the new state will be waiting for events to happen, update the event
 	 * set to wait for those
 	 */
-	events = SafekeeperStateDesiredEvents(success_state);
-	if (events)
-		wp->api.update_event_set(sk, events);
+	SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
+
+	/*
+	 * nwr_events is relevant only during SS_ACTIVE which doesn't use
+	 * BlockingWrite
+	 */
+	Assert(!nwr_events);
+	if (sk_events)
+		wp->api.update_event_set(sk, sk_events);
 
 	return true;
 }
@@ -1876,7 +1926,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 			return false;
 		case PG_ASYNC_WRITE_FAIL:
 			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-						sk->host, sk->port, FormatSafekeeperState(sk->state),
+						sk->host, sk->port, FormatSafekeeperState(sk),
 						wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
@@ -1915,7 +1965,7 @@ AsyncFlush(Safekeeper *sk)
 			return false;
 		case -1:
 			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
-						sk->host, sk->port, FormatSafekeeperState(sk->state),
+						sk->host, sk->port, FormatSafekeeperState(sk),
 						wp->api.conn_error_message(sk));
 			ResetConnection(sk);
 			return false;
@@ -1945,18 +1995,18 @@ CompareLsn(const void *a, const void *b)
  *
  * The strings are intended to be used as a prefix to "state", e.g.:
  *
- *   walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
+ *   walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
  *
  * If this sort of phrasing doesn't fit the message, instead use something like:
  *
- *   walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
+ *   walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
  */
 static char *
-FormatSafekeeperState(SafekeeperState state)
+FormatSafekeeperState(Safekeeper *sk)
 {
 	char	   *return_val = NULL;
 
-	switch (state)
+	switch (sk->state)
 	{
 		case SS_OFFLINE:
 			return_val = "offline";
@@ -1984,7 +2034,18 @@ FormatSafekeeperState(SafekeeperState state)
 			return_val = "idle";
 			break;
 		case SS_ACTIVE:
-			return_val = "active";
+			switch (sk->active_state)
+			{
+				case SS_ACTIVE_SEND:
+					return_val = "active send";
+					break;
+				case SS_ACTIVE_READ_WAL:
+					return_val = "active read WAL";
+					break;
+				case SS_ACTIVE_FLUSH:
+					return_val = "active flush";
+					break;
+			}
 			break;
 	}
 
@@ -1997,22 +2058,21 @@ FormatSafekeeperState(SafekeeperState state)
 static void
 AssertEventsOkForState(uint32 events, Safekeeper *sk)
 {
-	WalProposer *wp = sk->wp;
-	uint32		expected = SafekeeperStateDesiredEvents(sk->state);
-
-	/*
-	 * The events are in-line with what we're expecting, under two conditions:
-	 * (a) if we aren't expecting anything, `events` has no read- or
-	 * write-ready component. (b) if we are expecting something, there's
-	 * overlap (i.e. `events & expected != 0`)
-	 */
+	uint32		sk_events;
+	uint32		nwr_events;
+	uint32		expected;
 	bool		events_ok_for_state;	/* long name so the `Assert` is more
 										 * clear later */
+	WalProposer *wp = sk->wp;
 
-	if (expected == WL_NO_EVENTS)
-		events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0);
-	else
-		events_ok_for_state = ((events & expected) != 0);
+	SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
+
+	/*
+	 * Without one more level of notify target indirection we have no way to
+	 * distinguish which socket woke us up, so just union expected events.
+	 */
+	expected = sk_events | nwr_events;
+	events_ok_for_state = ((events & expected) != 0);
 
 	if (!events_ok_for_state)
 	{
@@ -2021,36 +2081,39 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
 		 * and then an assertion that's guaranteed to fail.
 		 */
 		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-					FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
+					FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
 		Assert(events_ok_for_state);
 	}
 }
 
-/* Returns the set of events a safekeeper in this state should be waiting on
+/* Returns the set of events for both safekeeper (sk_events) and neon_walreader
+ * (nwr_events) sockets a safekeeper in this state should be waiting on.
  *
  * This will return WL_NO_EVENTS (= 0) for some events. */
-static uint32
-SafekeeperStateDesiredEvents(SafekeeperState state)
+void
+SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events)
 {
-	uint32		result = WL_NO_EVENTS;
+	WalProposer *wp = sk->wp;
+
+	*nwr_events = 0;			/* nwr_events is empty for most states */
 
 	/* If the state doesn't have a modifier, we can check the base state */
-	switch (state)
+	switch (sk->state)
 	{
 			/* Connecting states say what they want in the name */
 		case SS_CONNECTING_READ:
-			result = WL_SOCKET_READABLE;
-			break;
+			*sk_events = WL_SOCKET_READABLE;
+			return;
 		case SS_CONNECTING_WRITE:
-			result = WL_SOCKET_WRITEABLE;
-			break;
+			*sk_events = WL_SOCKET_WRITEABLE;
+			return;
 
 			/* Reading states need the socket to be read-ready to continue */
 		case SS_WAIT_EXEC_RESULT:
 		case SS_HANDSHAKE_RECV:
 		case SS_WAIT_VERDICT:
-			result = WL_SOCKET_READABLE;
-			break;
+			*sk_events = WL_SOCKET_READABLE;
+			return;
 
 			/*
 			 * Idle states use read-readiness as a sign that the connection
@@ -2058,32 +2121,66 @@ SafekeeperStateDesiredEvents(SafekeeperState state)
 			 */
 		case SS_VOTING:
 		case SS_IDLE:
-			result = WL_SOCKET_READABLE;
-			break;
+			*sk_events = WL_SOCKET_READABLE;
+			return;
 
-			/*
-			 * Flush states require write-ready for flushing. Active state
-			 * does both reading and writing.
-			 *
-			 * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
-			 * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
-			 */
 		case SS_SEND_ELECTED_FLUSH:
+			*sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
+			return;
+
 		case SS_ACTIVE:
-			result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
-			break;
+			switch (sk->active_state)
+			{
+					/*
+					 * Everything is sent; we just wait for sk responses and
+					 * latch.
+					 *
+					 * Note: this assumes we send all available WAL to
+					 * safekeeper in one wakeup (unless it blocks). Otherwise
+					 * we would want WL_SOCKET_WRITEABLE here to finish the
+					 * work.
+					 */
+				case SS_ACTIVE_SEND:
+					*sk_events = WL_SOCKET_READABLE;
+					/* c.f. walprop_pg_active_state_update_event_set */
+#if PG_VERSION_NUM >= 150000
+					if (wp->api.wal_reader_events(sk))
+						*nwr_events = WL_SOCKET_CLOSED;
+#endif							/* on PG 14 nwr_events remains 0 */
+					return;
+
+					/*
+					 * Waiting for neon_walreader socket, but we still read
+					 * responses from sk socket.
+					 */
+				case SS_ACTIVE_READ_WAL:
+					*sk_events = WL_SOCKET_READABLE;
+					*nwr_events = wp->api.wal_reader_events(sk);
+					return;
+
+					/*
+					 * Need to flush the sk socket, so ignore neon_walreader
+					 * one and set write interest on sk.
+					 */
+				case SS_ACTIVE_FLUSH:
+					*sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
+#if PG_VERSION_NUM >= 150000
+					/* c.f. walprop_pg_active_state_update_event_set */
+					if (wp->api.wal_reader_events(sk))
+						*nwr_events = WL_SOCKET_CLOSED;
+#endif							/* on PG 14 nwr_events remains 0 */
+					return;
+			}
+			return;
 
 			/* The offline state expects no events. */
 		case SS_OFFLINE:
-			result = WL_NO_EVENTS;
-			break;
+			*sk_events = 0;
+			return;
 
 		default:
 			Assert(false);
-			break;
 	}
-
-	return result;
 }
 
 /* Returns a human-readable string corresponding to the event set
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 6ba2aae75b..a90e87b54f 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -8,6 +8,9 @@
 #include "replication/walreceiver.h"
 #include "utils/uuid.h"
 
+#include "libpqwalproposer.h"
+#include "neon_walreader.h"
+
 #define SK_MAGIC 0xCafeCeefu
 #define SK_PROTOCOL_VERSION 2
 
@@ -20,43 +23,9 @@
  */
 #define WL_NO_EVENTS 0
 
-struct WalProposerConn;			/* Defined in implementation (walprop_pg.c) */
+struct WalProposerConn;			/* Defined in libpqwalproposer.h */
 typedef struct WalProposerConn WalProposerConn;
 
-/* Possible return values from ReadPGAsync */
-typedef enum
-{
-	/* The full read was successful. buf now points to the data */
-	PG_ASYNC_READ_SUCCESS,
-
-	/*
-	 * The read is ongoing. Wait until the connection is read-ready, then try
-	 * again.
-	 */
-	PG_ASYNC_READ_TRY_AGAIN,
-	/* Reading failed. Check PQerrorMessage(conn) */
-	PG_ASYNC_READ_FAIL,
-} PGAsyncReadResult;
-
-/* Possible return values from WritePGAsync */
-typedef enum
-{
-	/* The write fully completed */
-	PG_ASYNC_WRITE_SUCCESS,
-
-	/*
-	 * The write started, but you'll need to call PQflush some more times to
-	 * finish it off. We just tried, so it's best to wait until the connection
-	 * is read- or write-ready to try again.
-	 *
-	 * If it becomes read-ready, call PQconsumeInput and flush again. If it
-	 * becomes write-ready, just call PQflush.
-	 */
-	PG_ASYNC_WRITE_TRY_FLUSH,
-	/* Writing failed. Check PQerrorMessage(conn) */
-	PG_ASYNC_WRITE_FAIL,
-} PGAsyncWriteResult;
-
 /*
  * WAL safekeeper state, which is used to wait for some event.
  *
@@ -133,6 +102,40 @@ typedef enum
 	SS_ACTIVE,
 } SafekeeperState;
 
+/*
+ * Sending WAL substates of SS_ACTIVE.
+ */
+typedef enum
+{
+	/*
+	 * We are ready to send more WAL, waiting for latch set to learn about
+	 * more WAL becoming available (or just a timeout to send heartbeat).
+	 */
+	SS_ACTIVE_SEND,
+
+	/*
+	 * Polling neon_walreader to receive chunk of WAL (probably remotely) to
+	 * send to this safekeeper.
+	 *
+	 * Note: socket management is done completely inside walproposer_pg for
+	 * simplicity, and thus simulation doesn't test it. Which is fine as
+	 * simulation is mainly aimed at consensus checks, not waiteventset
+	 * management.
+	 *
+	 * Also, while in this state we don't touch safekeeper socket, so in
+	 * theory it might close connection as inactive. This can be addressed if
+	 * needed; however, while fetching WAL we should regularly send it, so the
+	 * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
+	 * walreader socket), but similarly shouldn't be a problem.
+	 */
+	SS_ACTIVE_READ_WAL,
+
+	/*
+	 * Waiting for write readiness to flush the socket.
+	 */
+	SS_ACTIVE_FLUSH,
+} SafekeeperActiveState;
+
 /* Consensus logical timestamp. */
 typedef uint64 term_t;
 
@@ -341,12 +344,11 @@ typedef struct Safekeeper
 	 */
 	XLogRecPtr	startStreamingAt;
 
-	bool		flushWrite;		/* set to true if we need to call AsyncFlush,*
-								 * to flush pending messages */
 	XLogRecPtr	streamingAt;	/* current streaming position */
 	AppendRequestHeader appendRequest;	/* request for sending to safekeeper */
 
 	SafekeeperState state;		/* safekeeper state machine state */
+	SafekeeperActiveState active_state;
 	TimestampTz latestMsgReceivedAt;	/* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
@@ -367,12 +369,17 @@ typedef struct Safekeeper
 	/*
 	 * WAL reader, allocated for each safekeeper.
 	 */
-	XLogReaderState *xlogreader;
+	NeonWALReader *xlogreader;
 
 	/*
 	 * Position in wait event set. Equal to -1 if no event
 	 */
 	int			eventPos;
+
+	/*
+	 * Neon WAL reader position in wait event set, or -1 if no socket.
+	 */
+	int			nwrEventPos;
 #endif
 
 
@@ -401,31 +408,6 @@ typedef enum
 	 */
 } WalProposerConnectPollStatusType;
 
-/* Re-exported and modified ExecStatusType */
-typedef enum
-{
-	/* We received a single CopyBoth result */
-	WP_EXEC_SUCCESS_COPYBOTH,
-
-	/*
-	 * Any success result other than a single CopyBoth was received. The
-	 * specifics of the result were already logged, but it may be useful to
-	 * provide an error message indicating which safekeeper messed up.
-	 *
-	 * Do not expect PQerrorMessage to be appropriately set.
-	 */
-	WP_EXEC_UNEXPECTED_SUCCESS,
-
-	/*
-	 * No result available at this time. Wait until read-ready, then call
-	 * again. Internally, this is returned when PQisBusy indicates that
-	 * PQgetResult would block.
-	 */
-	WP_EXEC_NEEDS_INPUT,
-	/* Catch-all failure. Check PQerrorMessage. */
-	WP_EXEC_FAILED,
-} WalProposerExecStatusType;
-
 /* Re-exported ConnStatusType */
 typedef enum
 {
@@ -486,7 +468,7 @@ typedef struct walproposer_api
 	/* Flush buffer to the network, aka PQflush. */
 	int			(*conn_flush) (Safekeeper *sk);
 
-	/* Close the connection, aka PQfinish. */
+	/* Reset sk state: close pq connection, deallocate xlogreader. */
 	void		(*conn_finish) (Safekeeper *sk);
 
 	/*
@@ -506,14 +488,14 @@ typedef struct walproposer_api
 	/* Download WAL from startpos to endpos and make it available locally. */
 	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
 
-	/* Read WAL from disk to buf. */
-	void		(*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
-
 	/* Allocate WAL reader. */
 	void		(*wal_reader_allocate) (Safekeeper *sk);
 
-	/* Deallocate event set. */
-	void		(*free_event_set) (WalProposer *wp);
+	/* Read WAL from disk to buf. */
+	NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg);
+
+	/* Returns events to be awaited on WAL reader, if any. */
+	uint32		(*wal_reader_events) (Safekeeper *sk);
 
 	/* Initialize event set. */
 	void		(*init_event_set) (WalProposer *wp);
@@ -521,9 +503,15 @@ typedef struct walproposer_api
 	/* Update events for an existing safekeeper connection. */
 	void		(*update_event_set) (Safekeeper *sk, uint32 events);
 
+	/* Configure wait event set for yield in SS_ACTIVE. */
+	void		(*active_state_update_event_set) (Safekeeper *sk);
+
 	/* Add a new safekeeper connection to the event set. */
 	void		(*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
 
+	/* Remove safekeeper connection from event set */
+	void		(*rm_safekeeper_event_set) (Safekeeper *sk);
+
 	/*
 	 * Wait until some event happens: - timeout is reached - socket event for
 	 * safekeeper connection - new WAL is available
@@ -709,6 +697,13 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
 extern void WalProposerPoll(WalProposer *wp);
 extern void WalProposerFree(WalProposer *wp);
 
+/*
+ * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
+ * recreate set from scratch, hence the export.
+ */
+extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
+extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
+
 
 #define WPEVENT		1337		/* special log level for walproposer internal
 								 * events */
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index a197f425a6..6199def43f 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -44,10 +44,13 @@
 #include "utils/ps_status.h"
 #include "utils/timestamp.h"
 
-#include "neon.h"
-#include "walproposer.h"
 #include "libpq-fe.h"
 
+#include "libpqwalproposer.h"
+#include "neon.h"
+#include "neon_walreader.h"
+#include "walproposer.h"
+
 #define XLOG_HDR_SIZE (1 + 8 * 3)	/* 'w' + startPos + walEnd + timestamp */
 #define XLOG_HDR_START_POS 1	/* offset of start position in wal sender*
 								 * message header */
@@ -94,6 +97,10 @@ static void XLogBroadcastWalProposer(WalProposer *wp);
 static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
 static void XLogWalPropClose(XLogRecPtr recptr);
 
+static void add_nwr_event_set(Safekeeper *sk, uint32 events);
+static void update_nwr_event_set(Safekeeper *sk, uint32 events);
+static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
+
 static void
 init_walprop_config(bool syncSafekeepers)
 {
@@ -543,14 +550,6 @@ walprop_pg_load_libpqwalreceiver(void)
 		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
 }
 
-/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
-struct WalProposerConn
-{
-	PGconn	   *pg_conn;
-	bool		is_nonblocking; /* whether the connection is non-blocking */
-	char	   *recvbuf;		/* last received data from walprop_async_read */
-};
-
 /* Helper function */
 static bool
 ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
@@ -588,16 +587,17 @@ walprop_status(Safekeeper *sk)
 	}
 }
 
-static void
-walprop_connect_start(Safekeeper *sk)
+WalProposerConn *
+libpqwp_connect_start(char *conninfo)
 {
+
 	PGconn	   *pg_conn;
+	WalProposerConn *conn;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
 	char	   *password = neon_auth_token;
 
-	Assert(sk->conn == NULL);
 
 	/*
 	 * Connect using the given connection string. If the NEON_AUTH_TOKEN
@@ -616,7 +616,7 @@ walprop_connect_start(Safekeeper *sk)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = sk->conninfo;
+	values[n] = conninfo;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
@@ -637,11 +637,20 @@ walprop_connect_start(Safekeeper *sk)
 	 * palloc will exit on failure though, so there's not much we could do if
 	 * it *did* fail.
 	 */
-	sk->conn = palloc(sizeof(WalProposerConn));
-	sk->conn->pg_conn = pg_conn;
-	sk->conn->is_nonblocking = false;	/* connections always start in
-										 * blocking mode */
-	sk->conn->recvbuf = NULL;
+	conn = palloc(sizeof(WalProposerConn));
+	conn->pg_conn = pg_conn;
+	conn->is_nonblocking = false;	/* connections always start in blocking
+									 * mode */
+	conn->recvbuf = NULL;
+	return conn;
+}
+
+static void
+walprop_connect_start(Safekeeper *sk)
+{
+	Assert(sk->conn == NULL);
+	sk->conn = libpqwp_connect_start(sk->conninfo);
+
 }
 
 static WalProposerConnectPollStatusType
@@ -685,26 +694,33 @@ walprop_connect_poll(Safekeeper *sk)
 	return return_val;
 }
 
-static bool
-walprop_send_query(Safekeeper *sk, char *query)
+extern bool
+libpqwp_send_query(WalProposerConn *conn, char *query)
 {
 	/*
 	 * We need to be in blocking mode for sending the query to run without
 	 * requiring a call to PQflush
 	 */
-	if (!ensure_nonblocking_status(sk->conn, false))
+	if (!ensure_nonblocking_status(conn, false))
 		return false;
 
 	/* PQsendQuery returns 1 on success, 0 on failure */
-	if (!PQsendQuery(sk->conn->pg_conn, query))
+	if (!PQsendQuery(conn->pg_conn, query))
 		return false;
 
 	return true;
 }
 
-static WalProposerExecStatusType
-walprop_get_query_result(Safekeeper *sk)
+static bool
+walprop_send_query(Safekeeper *sk, char *query)
 {
+	return libpqwp_send_query(sk->conn, query);
+}
+
+WalProposerExecStatusType
+libpqwp_get_query_result(WalProposerConn *conn)
+{
+
 	PGresult   *result;
 	WalProposerExecStatusType return_val;
 
@@ -712,14 +728,14 @@ walprop_get_query_result(Safekeeper *sk)
 	char	   *unexpected_success = NULL;
 
 	/* Consume any input that we might be missing */
-	if (!PQconsumeInput(sk->conn->pg_conn))
+	if (!PQconsumeInput(conn->pg_conn))
 		return WP_EXEC_FAILED;
 
-	if (PQisBusy(sk->conn->pg_conn))
+	if (PQisBusy(conn->pg_conn))
 		return WP_EXEC_NEEDS_INPUT;
 
 
-	result = PQgetResult(sk->conn->pg_conn);
+	result = PQgetResult(conn->pg_conn);
 
 	/*
 	 * PQgetResult returns NULL only if getting the result was successful &
@@ -780,6 +796,12 @@ walprop_get_query_result(Safekeeper *sk)
 	return return_val;
 }
 
+static WalProposerExecStatusType
+walprop_get_query_result(Safekeeper *sk)
+{
+	return libpqwp_get_query_result(sk->conn);
+}
+
 static pgsocket
 walprop_socket(Safekeeper *sk)
 {
@@ -792,38 +814,21 @@ walprop_flush(Safekeeper *sk)
 	return (PQflush(sk->conn->pg_conn));
 }
 
-static void
-walprop_finish(Safekeeper *sk)
+/* Like libpqrcv_receive. *buf is valid until the next call. */
+PGAsyncReadResult
+libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 {
-	if (!sk->conn)
-		return;
 
-	if (sk->conn->recvbuf != NULL)
-		PQfreemem(sk->conn->recvbuf);
-	PQfinish(sk->conn->pg_conn);
-	pfree(sk->conn);
-	sk->conn = NULL;
-}
-
-/*
- * Receive a message from the safekeeper.
- *
- * On success, the data is placed in *buf. It is valid until the next call
- * to this function.
- */
-static PGAsyncReadResult
-walprop_async_read(Safekeeper *sk, char **buf, int *amount)
-{
 	int			result;
 
-	if (sk->conn->recvbuf != NULL)
+	if (conn->recvbuf != NULL)
 	{
-		PQfreemem(sk->conn->recvbuf);
-		sk->conn->recvbuf = NULL;
+		PQfreemem(conn->recvbuf);
+		conn->recvbuf = NULL;
 	}
 
 	/* Call PQconsumeInput so that we have the data we need */
-	if (!PQconsumeInput(sk->conn->pg_conn))
+	if (!PQconsumeInput(conn->pg_conn))
 	{
 		*amount = 0;
 		*buf = NULL;
@@ -841,7 +846,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
 	 * sometimes be triggered by the server returning an ErrorResponse (which
 	 * also happens to have the effect that the copy is done).
 	 */
-	switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true))
+	switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
 	{
 		case 0:
 			*amount = 0;
@@ -856,7 +861,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
 				 * We can check PQgetResult to make sure that the server
 				 * failed; it'll always result in PGRES_FATAL_ERROR
 				 */
-				ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn));
+				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
 
 				if (status != PGRES_FATAL_ERROR)
 					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
@@ -877,11 +882,23 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
 		default:
 			/* Positive values indicate the size of the returned result */
 			*amount = result;
-			*buf = sk->conn->recvbuf;
+			*buf = conn->recvbuf;
 			return PG_ASYNC_READ_SUCCESS;
 	}
 }
 
+/*
+ * Receive a message from the safekeeper.
+ *
+ * On success, the data is placed in *buf. It is valid until the next call
+ * to this function.
+ */
+static PGAsyncReadResult
+walprop_async_read(Safekeeper *sk, char **buf, int *amount)
+{
+	return libpqwp_async_read(sk->conn, buf, amount);
+}
+
 static PGAsyncWriteResult
 walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 {
@@ -964,6 +981,33 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
 	return true;
 }
 
+void
+libpqwp_disconnect(WalProposerConn *conn)
+{
+	if (conn->recvbuf != NULL)
+		PQfreemem(conn->recvbuf);
+	PQfinish(conn->pg_conn);
+	pfree(conn);
+}
+
+static void
+walprop_finish(Safekeeper *sk)
+{
+	if (sk->conn)
+	{
+		libpqwp_disconnect(sk->conn);
+		sk->conn = NULL;
+	}
+
+	/* free xlogreader */
+	if (sk->xlogreader)
+	{
+		NeonWALReaderFree(sk->xlogreader);
+		sk->xlogreader = NULL;
+	}
+	rm_safekeeper_event_set(sk, false);
+}
+
 /*
  * Subscribe for new WAL and stream it in the loop to safekeepers.
  *
@@ -1402,30 +1446,56 @@ XLogWalPropClose(XLogRecPtr recptr)
 	walpropFile = -1;
 }
 
-static void
-walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
-{
-	WALReadError errinfo;
-
-	if (!WALRead(sk->xlogreader,
-				 buf,
-				 startptr,
-				 count,
-				 walprop_pg_get_timeline_id(),
-				 &errinfo))
-	{
-		WALReadRaiseError(&errinfo);
-	}
-}
-
 static void
 walprop_pg_wal_reader_allocate(Safekeeper *sk)
 {
-	sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
+	char		log_prefix[64];
+
+	snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
+	Assert(!sk->xlogreader);
+	sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
 	if (sk->xlogreader == NULL)
 		elog(FATAL, "Failed to allocate xlog reader");
 }
 
+static NeonWALReadResult
+walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg)
+{
+	NeonWALReadResult res;
+
+	res = NeonWALRead(sk->xlogreader,
+					  buf,
+					  startptr,
+					  count,
+					  walprop_pg_get_timeline_id());
+
+	if (res == NEON_WALREAD_SUCCESS)
+	{
+		/*
+		 * If we have the socket subscribed, but walreader doesn't need any
+		 * events, it must mean that remote connection just closed hoping to
+		 * do next read locally. Remove the socket then. It is important to do
+		 * as otherwise next read might open another connection and we won't
+		 * be able to distinguish whether we have correct socket added in wait
+		 * event set.
+		 */
+		if (NeonWALReaderEvents(sk->xlogreader) == 0)
+			rm_safekeeper_event_set(sk, false);
+	}
+	else if (res == NEON_WALREAD_ERROR)
+	{
+		*errmsg = NeonWALReaderErrMsg(sk->xlogreader);
+	}
+
+	return res;
+}
+
+static uint32
+walprop_pg_wal_reader_events(Safekeeper *sk)
+{
+	return NeonWALReaderEvents(sk->xlogreader);
+}
+
 static WaitEventSet *waitEvents;
 
 static void
@@ -1440,6 +1510,7 @@ walprop_pg_free_event_set(WalProposer *wp)
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
 		wp->safekeeper[i].eventPos = -1;
+		wp->safekeeper[i].nwrEventPos = -1;
 	}
 }
 
@@ -1449,11 +1520,35 @@ walprop_pg_init_event_set(WalProposer *wp)
 	if (waitEvents)
 		elog(FATAL, "double-initialization of event set");
 
-	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers);
+	/* for each sk, we have socket plus potentially socket for neon walreader */
+	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
 	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
 	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
+
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		wp->safekeeper[i].eventPos = -1;
+		wp->safekeeper[i].nwrEventPos = -1;
+	}
+}
+
+/* add safekeeper socket to wait event set */
+static void
+walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
+{
+	Assert(sk->eventPos == -1);
+	sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
+}
+
+/* add neon wal reader socket to wait event set */
+static void
+add_nwr_event_set(Safekeeper *sk, uint32 events)
+{
+	Assert(sk->nwrEventPos == -1);
+	sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
+	elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
 }
 
 static void
@@ -1465,10 +1560,147 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
 	ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
 }
 
+/*
+ * Update neon_walreader event.
+ * Can be called when nwr socket doesn't exist, does nothing in this case.
+ */
 static void
-walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
+update_nwr_event_set(Safekeeper *sk, uint32 events)
 {
-	sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
+	/* nwrEventPos = -1 when the walreader socket is not in the event set */
+	if (sk->nwrEventPos != -1)
+		ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL);
+}
+
+
+static void
+walprop_pg_active_state_update_event_set(Safekeeper *sk)
+{
+	uint32		sk_events;
+	uint32		nwr_events;
+
+	Assert(sk->state == SS_ACTIVE);
+	SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
+
+	/*
+	 * If we need to wait for neon_walreader, ensure we have up to date socket
+	 * in the wait event set.
+	 */
+	if (sk->active_state == SS_ACTIVE_READ_WAL)
+	{
+		/*
+		 * TODO: instead of reattaching socket (and thus recreating WES) each
+		 * time we should keep it if possible, i.e. if connection is already
+		 * established. Note that single neon_walreader object can switch
+		 * between local and remote reads multiple times during its lifetime,
+		 * so careful bookkeeping is needed here.
+		 */
+		rm_safekeeper_event_set(sk, false);
+		add_nwr_event_set(sk, nwr_events);
+	}
+	else
+	{
+		/*
+		 * Hack: we should always set 0 here, but for random reasons
+		 * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least
+		 * some event. Since there is also no way to remove socket except
+		 * reconstructing the whole set, SafekeeperStateDesiredEvents instead
+		 * gives WL_SOCKET_CLOSED if socket exists. We never expect it to
+		 * trigger.
+		 *
+		 * On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event
+		 * removal.
+		 */
+#if PG_VERSION_NUM >= 150000
+		Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0);
+		update_nwr_event_set(sk, WL_SOCKET_CLOSED);
+#else							/* pg 14 */
+		rm_safekeeper_event_set(sk, false);
+#endif
+	}
+	walprop_pg_update_event_set(sk, sk_events);
+}
+
+static void
+walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove)
+{
+	rm_safekeeper_event_set(to_remove, true);
+}
+
+/*
+ * A hacky way to remove single event from the event set. Can be called if event
+ * doesn't exist, does nothing in this case.
+ *
+ * Note: Internally, this completely reconstructs the event set. It should be
+ * avoided if possible.
+ *
+ * If is_sk is true, socket of connection to safekeeper is removed; otherwise
+ * socket of neon_walreader.
+ */
+static void
+rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
+{
+	WalProposer *wp = to_remove->wp;
+
+	elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
+		 to_remove->host, to_remove->port, is_sk);
+
+	/*
+	 * Shortpath for exiting if have nothing to do. We never call this
+	 * function with safekeeper socket not existing, but do that with neon
+	 * walreader socket.
+	 */
+	if ((is_sk && to_remove->eventPos == -1) ||
+		(!is_sk && to_remove->nwrEventPos == -1))
+	{
+		return;
+	}
+
+	/* Remove the existing event set, resetting eventPos/nwrEventPos to -1 */
+	walprop_pg_free_event_set(wp);
+
+	/* Re-initialize it without adding any safekeeper events */
+	wp->api.init_event_set(wp);
+
+	/*
+	 * loop through the existing safekeepers. If they aren't the one we're
+	 * removing, and if they have a socket we can use, re-add the applicable
+	 * events.
+	 */
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		Safekeeper *sk = &wp->safekeeper[i];
+
+		if (sk == to_remove)
+		{
+			if (is_sk)
+				sk->eventPos = -1;
+			else
+				sk->nwrEventPos = -1;
+		}
+
+		/*
+		 * If this safekeeper isn't offline, re-add its sk socket and its
+		 * walreader socket independently, skipping only the removed event.
+		 */
+		if (sk->state != SS_OFFLINE)
+		{
+			uint32		sk_events;
+			uint32		nwr_events;
+
+			SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
+
+			if (sk != to_remove || !is_sk)
+			{
+				/* will set sk->eventPos */
+				wp->api.add_safekeeper_event_set(sk, sk_events);
+			}
+			if ((sk != to_remove || is_sk) && nwr_events)
+			{
+				add_nwr_event_set(sk, nwr_events);
+			}
+		}
+	}
+}
 
 static int
@@ -1750,12 +1982,14 @@ static const walproposer_api walprop_pg = {
 	.conn_async_write = walprop_async_write,
 	.conn_blocking_write = walprop_blocking_write,
 	.recovery_download = WalProposerRecovery,
-	.wal_read = walprop_pg_wal_read,
 	.wal_reader_allocate = walprop_pg_wal_reader_allocate,
-	.free_event_set = walprop_pg_free_event_set,
+	.wal_read = walprop_pg_wal_read,
+	.wal_reader_events = walprop_pg_wal_reader_events,
 	.init_event_set = walprop_pg_init_event_set,
 	.update_event_set = walprop_pg_update_event_set,
+	.active_state_update_event_set = walprop_pg_active_state_update_event_set,
 	.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
+	.rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set,
 	.wait_event_set = walprop_pg_wait_event_set,
 	.strong_random = walprop_pg_strong_random,
 	.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,

From 14913c6443f36e9c94cab63698fdfd910a016148 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 8 Dec 2023 18:05:48 +0300
Subject: [PATCH 162/202] Adapt rust walproposer to neon_walreader.

---
 libs/walproposer/src/api_bindings.rs | 61 +++++++++++++++++++---------
 libs/walproposer/src/walproposer.rs  | 37 +++++++++++------
 2 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 77afe1e686..2f633243be 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -8,6 +8,7 @@ use std::ffi::CString;
 
 use crate::bindings::uint32;
 use crate::bindings::walproposer_api;
+use crate::bindings::NeonWALReadResult;
 use crate::bindings::PGAsyncReadResult;
 use crate::bindings::PGAsyncWriteResult;
 use crate::bindings::Safekeeper;
@@ -191,21 +192,6 @@ extern "C" fn recovery_download(
     }
 }
 
-#[allow(clippy::unnecessary_cast)]
-extern "C" fn wal_read(
-    sk: *mut Safekeeper,
-    buf: *mut ::std::os::raw::c_char,
-    startptr: XLogRecPtr,
-    count: Size,
-) {
-    unsafe {
-        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).wal_read(&mut (*sk), buf, startptr)
-    }
-}
-
 extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
@@ -214,11 +200,28 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
     }
 }
 
-extern "C" fn free_event_set(wp: *mut WalProposer) {
+#[allow(clippy::unnecessary_cast)]
+extern "C" fn wal_read(
+    sk: *mut Safekeeper,
+    buf: *mut ::std::os::raw::c_char,
+    startptr: XLogRecPtr,
+    count: Size,
+    _errmsg: *mut *mut ::std::os::raw::c_char,
+) -> NeonWALReadResult {
     unsafe {
-        let callback_data = (*(*wp).config).callback_data;
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).free_event_set(&mut (*wp));
+        // TODO: errmsg is not forwarded
+        (*api).wal_read(&mut (*sk), buf, startptr)
+    }
+}
+
+extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_reader_events(&mut (*sk))
     }
 }
 
@@ -238,6 +241,14 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
     }
 }
 
+extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).active_state_update_event_set(&mut (*sk));
+    }
+}
+
 extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
@@ -246,6 +257,14 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
     }
 }
 
+extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).rm_safekeeper_event_set(&mut (*sk));
+    }
+}
+
 extern "C" fn wait_event_set(
     wp: *mut WalProposer,
     timeout: ::std::os::raw::c_long,
@@ -401,12 +420,14 @@ pub(crate) fn create_api() -> walproposer_api {
         conn_async_write: Some(conn_async_write),
         conn_blocking_write: Some(conn_blocking_write),
         recovery_download: Some(recovery_download),
-        wal_read: Some(wal_read),
         wal_reader_allocate: Some(wal_reader_allocate),
-        free_event_set: Some(free_event_set),
+        wal_read: Some(wal_read),
+        wal_reader_events: Some(wal_reader_events),
         init_event_set: Some(init_event_set),
         update_event_set: Some(update_event_set),
+        active_state_update_event_set: Some(active_state_update_event_set),
         add_safekeeper_event_set: Some(add_safekeeper_event_set),
+        rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
         wait_event_set: Some(wait_event_set),
         strong_random: Some(strong_random),
         get_redo_start_lsn: Some(get_redo_start_lsn),
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index f5723018d7..013400325d 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -6,8 +6,8 @@ use utils::id::TenantTimelineId;
 use crate::{
     api_bindings::{create_api, take_vec_u8, Level},
     bindings::{
-        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
-        WalProposerStart,
+        NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
+        WalProposerFree, WalProposerStart,
     },
 };
 
@@ -90,15 +90,15 @@ pub trait ApiImpl {
         todo!()
     }
 
-    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
+    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
         todo!()
     }
 
-    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
+    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
         todo!()
     }
 
-    fn free_event_set(&self, _wp: &mut WalProposer) {
+    fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
         todo!()
     }
 
@@ -110,10 +110,18 @@ pub trait ApiImpl {
         todo!()
     }
 
+    fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
     fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
         todo!()
     }
 
+    fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
     fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
         todo!()
     }
@@ -240,6 +248,7 @@ impl Drop for Wrapper {
 
 #[cfg(test)]
 mod tests {
+    use core::panic;
     use std::{
         cell::Cell,
         sync::{atomic::AtomicUsize, mpsc::sync_channel},
@@ -247,7 +256,7 @@ mod tests {
 
     use utils::id::TenantTimelineId;
 
-    use crate::{api_bindings::Level, walproposer::Wrapper};
+    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
 
     use super::ApiImpl;
 
@@ -355,12 +364,9 @@ mod tests {
             true
         }
 
-        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
-            println!("wal_reader_allocate")
-        }
-
-        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
-            println!("free_event_set")
+        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
+            println!("wal_reader_allocate");
+            crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
         }
 
         fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
@@ -383,6 +389,13 @@ mod tests {
             self.wait_events.set(WaitEventsData { sk, event_mask });
         }
 
+        fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) {
+            println!(
+                "rm_safekeeper_event_set, sk={:?}",
+                sk as *mut crate::bindings::Safekeeper
+            );
+        }
+
         fn wait_event_set(
             &self,
             _: &mut crate::bindings::WalProposer,

From df760e6de5c2a398de3f00d7deba97d5db5fded4 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 6 Dec 2023 10:12:19 +0300
Subject: [PATCH 163/202] Add test_lagging_sk.

---
 test_runner/fixtures/neon_fixtures.py    |  23 ++
 test_runner/regress/test_wal_acceptor.py | 295 +++++++++++++++++++----
 2 files changed, 267 insertions(+), 51 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a9133f1c9c..597e311e02 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -365,6 +365,12 @@ class PgProtocol:
                         result.append(cur.fetchall())
         return result
 
+    def safe_psql_scalar(self, query) -> Any:
+        """
+        Execute query returning single row with single column.
+        """
+        return self.safe_psql(query)[0][0]
+
 
 @dataclass
 class AuthKeys:
@@ -2733,6 +2739,13 @@ class Endpoint(PgProtocol):
     ):
         self.stop()
 
+    # Checkpoints running endpoint and returns pg_wal size in MB.
+    def get_pg_wal_size(self):
+        log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
+        self.safe_psql("checkpoint")
+        assert self.pgdata_dir is not None  # please mypy
+        return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
+
 
 class EndpointFactory:
     """An object representing multiple compute endpoints."""
@@ -2931,6 +2944,13 @@ class Safekeeper:
         return segments
 
 
+# Walreceiver as returned by sk's timeline status endpoint.
+@dataclass
+class Walreceiver:
+    conn_id: int
+    state: str
+
+
 @dataclass
 class SafekeeperTimelineStatus:
     acceptor_epoch: int
@@ -2941,6 +2961,7 @@ class SafekeeperTimelineStatus:
     backup_lsn: Lsn
     peer_horizon_lsn: Lsn
     remote_consistent_lsn: Lsn
+    walreceivers: List[Walreceiver]
 
 
 @dataclass
@@ -3002,6 +3023,7 @@ class SafekeeperHttpClient(requests.Session):
         res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
         res.raise_for_status()
         resj = res.json()
+        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
         return SafekeeperTimelineStatus(
             acceptor_epoch=resj["acceptor_state"]["epoch"],
             pg_version=resj["pg_info"]["pg_version"],
@@ -3011,6 +3033,7 @@ class SafekeeperHttpClient(requests.Session):
             backup_lsn=Lsn(resj["backup_lsn"]),
             peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
             remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
+            walreceivers=walreceivers,
         )
 
     def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 3c40a9cb3e..5a0856c69c 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -419,7 +419,8 @@ def wait(f, desc, timeout=30, wait_f=None):
         try:
             if f():
                 break
-        except Exception:
+        except Exception as e:
+            log.info(f"got exception while waiting for {desc}: {e}")
             pass
         elapsed = time.time() - started_at
         if elapsed > timeout:
@@ -1001,8 +1002,40 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
         endpoint.start()
 
 
+# Context manager which logs passed time on exit.
+class DurationLogger:
+    def __init__(self, desc):
+        self.desc = desc
+
+    def __enter__(self):
+        self.ts_before = time.time()
+
+    def __exit__(self, *exc):
+        log.info(f"{self.desc} finished in {time.time() - self.ts_before}s")
+
+
+# Context manager which logs WAL position change on exit.
+class WalChangeLogger:
+    def __init__(self, ep, desc_before):
+        self.ep = ep
+        self.desc_before = desc_before
+
+    def __enter__(self):
+        self.ts_before = time.time()
+        self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()"))
+        log.info(f"{self.desc_before}, lsn_before={self.lsn_before}")
+
+    def __exit__(self, *exc):
+        lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()"))
+        log.info(
+            f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s"
+        )
+
+
 # Test that we can create timeline with one safekeeper down and initialize it
-# later when some data already had been written.
+# later when some data already had been written. It is strictly weaker than
+# test_lagging_sk, but also is the simplest test to trigger WAL sk -> compute
+# download (recovery) and as such useful for development/testing.
 def test_late_init(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
@@ -1010,12 +1043,13 @@ def test_late_init(neon_env_builder: NeonEnvBuilder):
     sk1 = env.safekeepers[0]
     sk1.stop()
 
-    # create and insert smth while safekeeper is down...
-    env.neon_cli.create_branch("test_late_init")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_late_init")
     endpoint = env.endpoints.create_start("test_late_init")
+    # create and insert smth while safekeeper is down...
     endpoint.safe_psql("create table t(key int, value text)")
-    endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
-    log.info("insert with safekeeper down done")
+    with WalChangeLogger(endpoint, "doing insert with sk1 down"):
+        endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
     endpoint.stop()  # stop compute
 
     # stop another safekeeper, and start one which missed timeline creation
@@ -1024,28 +1058,213 @@ def test_late_init(neon_env_builder: NeonEnvBuilder):
     sk1.start()
 
     # insert some more
-    endpoint = env.endpoints.create_start("test_late_init")
+    with DurationLogger("recovery"):
+        endpoint = env.endpoints.create_start("test_late_init")
     endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
 
+    wait_flush_lsn_align_by_ep(
+        env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]]
+    )
+    # Check that WALs are the same.
+    cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id)
+
 
 # is timeline flush_lsn equal on provided safekeepers?
-def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id):
-    status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id)
-    status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id)
-    log.info(
-        f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}"
+def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
+    flush_lsns = [
+        sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn
+        for sk_http_cli in sk_http_clis
+    ]
+    log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}")
+    return all([flush_lsns[0] == flsn for flsn in flush_lsns])
+
+
+def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId):
+    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
+    return len(status.walreceivers) == 0
+
+
+# Assert by xxd that WAL on given safekeepers is identical. No compute must be
+# running for this to be reliable.
+def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):
+    assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed"
+    sk_http_clis = [sk.http_client() for sk in sks]
+
+    # First check that term / flush_lsn are the same: it is easier to
+    # report/understand if WALs are different due to that.
+    statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
+    term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses]
+    for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
+        assert (
+            term_flush_lsns[0] == tfl
+        ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
+
+    # check that WALs are identical.
+    segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
+    for cmp_segs, sk in zip(segs[1:], sks[1:]):
+        assert (
+            segs[0] == cmp_segs
+        ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}"
+    log.info(f"comparing segs {segs[0]}")
+
+    sk0 = sks[0]
+    for sk in sks[1:]:
+        (_, mismatch, not_regular) = filecmp.cmpfiles(
+            sk0.timeline_dir(tenant_id, timeline_id),
+            sk.timeline_dir(tenant_id, timeline_id),
+            segs[0],
+            shallow=False,
+        )
+        log.info(
+            f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
+        )
+
+        for f in mismatch:
+            f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
+            f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
+            stdout_filename = "{}.filediff".format(f2)
+
+            with open(stdout_filename, "w") as stdout_f:
+                subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
+                subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
+
+                cmd = "diff {}.hex {}.hex".format(f1, f2)
+                subprocess.run([cmd], stdout=stdout_f, shell=True)
+        # Checked outside the loop so files that could not be compared also fail.
+        assert (mismatch, not_regular) == (
+            [],
+            [],
+        ), f"WAL on sks {sks[0].id} and {sk.id} differs: mismatch={mismatch}, not_regular={not_regular}"
+
+
+# Wait until flush_lsn on given sks becomes equal, assuming endpoint ep is
+# running. ep is stopped by this function. This is used in tests which check
+# binary equality of WAL segments on safekeepers; which is inherently racy as
+# shutting down endpoint might always write some WAL which can get to only one
+# safekeeper. So here we recheck flush_lsn again after ep shutdown and retry if
+# it has changed.
+def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks):
+    sk_http_clis = [sk.http_client() for sk in sks]
+    # First wait for the alignment.
+    wait(
+        partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id),
+        "flush_lsn to get aligned",
     )
-    return status1.flush_lsn == status2.flush_lsn
+    ep.stop()  # then stop endpoint
+    # Even if there is no compute, there might be some in flight data; ensure
+    # all walreceivers die before rechecking.
+    for sk_http_cli in sk_http_clis:
+        wait(
+            partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id),
+            "walreceivers to be gone",
+        )
+    # Recheck flush_lsn: exit if it is still aligned, otherwise retry from scratch
+    if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
+        return
+    log.info("flush_lsn changed during endpoint shutdown; retrying alignment")
+    ep = env.endpoints.create_start(branch)
+    wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks)
 
 
-# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
-# 1) walproposer can't recover node if it misses WAL written by previous computes, but
-#    still starts up and functions normally if two other sks are ok.
-# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions
-#    normally if two other sks are ok.
-# 3) Lagged safekeeper can still recover by peer recovery.
-def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
-    pass
+# Test behaviour with one safekeeper down and missing a lot of WAL, exercising
+# neon_walreader and checking that pg_wal never bloats. Namely, ensures that
+# compute doesn't keep much WAL for the lagging sk, but can still recover it with
+# neon_walreader, in two scenarios: a) WAL never existed on compute (it started
+# on basebackup LSN later than lagging sk position) though segment file exists;
+# b) WAL had been recycled on it and segment file doesn't exist.
+#
+# Also checks along the way that whenever there are two sks alive, compute
+# should be able to commit.
+def test_lagging_sk(neon_env_builder: NeonEnvBuilder):
+    # inserts ~20MB of WAL, a bit more than a segment.
+    def fill_segment(ep):
+        ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'")
+
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    (sk1, sk2, sk3) = env.safekeepers
+
+    # create and insert smth while safekeeper is down...
+    sk1.stop()
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_lagging_sk")
+    ep = env.endpoints.create_start("test_lagging_sk")
+    ep.safe_psql("create table t(key int, value text)")
+    # make small insert to be on the same segment
+    ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
+    log.info("insert with safekeeper down done")
+    ep.stop()  # stop compute
+
+    # Stop another safekeeper, and start one which missed timeline creation.
+    sk2.stop()
+    sk1.start()
+
+    # Start new ep and insert some more. neon_walreader should download WAL for
+    # sk1 because it should be filled since the horizon (initial LSN) which is
+    # earlier than basebackup LSN.
+    ep = env.endpoints.create_start("test_lagging_sk")
+    ep.safe_psql("insert into t select generate_series(1,100), 'payload'")
+    # stop ep and ensure WAL is identical after recovery.
+    wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3])
+    # Check that WALs are the same.
+    cmp_sk_wal([sk1, sk3], tenant_id, timeline_id)
+
+    # Now repeat insertion with sk1 down, but with inserting more data to check
+    # that WAL on compute is removed.
+    sk1.stop()
+    sk2.start()
+
+    # min_wal_size must be at least 2x segment size.
+    min_wal_config = [
+        "min_wal_size=32MB",
+        "max_wal_size=32MB",
+        "wal_keep_size=0",
+        "log_checkpoints=on",
+    ]
+    ep = env.endpoints.create_start(
+        "test_lagging_sk",
+        config_lines=min_wal_config,
+    )
+    with WalChangeLogger(ep, "doing large insert with sk1 down"):
+        for _ in range(0, 5):
+            fill_segment(ep)
+    # there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
+    assert ep.get_pg_wal_size() < 16 * 2.5
+
+    sk2.stop()  # stop another sk to ensure sk1 and sk3 can work
+    sk1.start()
+    with DurationLogger("recovery"):
+        ep.safe_psql("insert into t select generate_series(1,100), 'payload'")  # forces recovery
+    # stop ep and ensure WAL is identical after recovery.
+    wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3])
+    # Check that WALs are the same.
+    cmp_sk_wal([sk1, sk3], tenant_id, timeline_id)
+
+    # Now do the same with different safekeeper sk2 down, and restarting ep
+    # before recovery (again scenario when recovery starts below basebackup_lsn,
+    # but multi segment now).
+    ep = env.endpoints.create_start(
+        "test_lagging_sk",
+        config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"],
+    )
+    with WalChangeLogger(ep, "doing large insert with sk2 down"):
+        for _ in range(0, 5):
+            fill_segment(ep)
+    # there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
+    assert ep.get_pg_wal_size() < 16 * 2.5
+
+    ep.stop()
+    ep = env.endpoints.create_start(
+        "test_lagging_sk",
+        config_lines=min_wal_config,
+    )
+    sk2.start()
+    with DurationLogger("recovery"):
+        wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3])
+    # Check that WALs are the same.
+    cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id)
 
 
 # Smaller version of test_one_sk_down testing peer recovery in isolation: that
@@ -1065,7 +1284,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
     sk2_http_cli = sk2.http_client()
     # ensure tli gets created on sk1, peer recovery won't do that
     wait(
-        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
+        partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id),
         "flush_lsn to get aligned",
     )
 
@@ -1087,7 +1306,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
     assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024
 
     # wait a bit, lsns shouldn't change
-    # time.sleep(5)
+    time.sleep(2)
     sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
     sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
     log.info(
@@ -1098,37 +1317,11 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
     # now restart safekeeper with peer recovery enabled and wait for recovery
     sk1.stop().start(extra_opts=["--peer-recovery=true"])
     wait(
-        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
+        partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id),
         "flush_lsn to get aligned",
     )
 
-    # check that WALs are identic after recovery
-    segs = sk1.list_segments(tenant_id, timeline_id)
-    log.info(f"segs are {segs}")
-
-    (_, mismatch, not_regular) = filecmp.cmpfiles(
-        sk1.timeline_dir(tenant_id, timeline_id),
-        sk2.timeline_dir(tenant_id, timeline_id),
-        segs,
-        shallow=False,
-    )
-    log.info(
-        f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
-    )
-
-    for f in mismatch:
-        f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f)
-        f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f)
-        stdout_filename = "{}.filediff".format(f2)
-
-        with open(stdout_filename, "w") as stdout_f:
-            subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
-            subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
-
-            cmd = "diff {}.hex {}.hex".format(f1, f2)
-            subprocess.run([cmd], stdout=stdout_f, shell=True)
-
-    assert (mismatch, not_regular) == ([], [])
+    cmp_sk_wal([sk1, sk2], tenant_id, timeline_id)
 
     # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit
     env.safekeepers[2].stop()

From 9c493869c786ee2e5a4e099bef7f5273b0b68746 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 14 Dec 2023 17:08:36 +0300
Subject: [PATCH 164/202] Perform synchronous WAL download in wp only for
 logical replication.

wp -> sk communication now uses neon_walreader which will fetch missing WAL on
demand from safekeepers, so doesn't need this anymore. Also, cap WAL download by
max_slot_wal_keep_size to be able to start compute if lag is too high.
---
 libs/walproposer/src/api_bindings.rs | 19 +------
 libs/walproposer/src/walproposer.rs  | 10 +++-
 pgxn/neon/walproposer.c              | 28 +++-------
 pgxn/neon/walproposer.h              | 15 ++----
 pgxn/neon/walproposer_pg.c           | 78 +++++++++++++++++++++-------
 5 files changed, 82 insertions(+), 68 deletions(-)

diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 2f633243be..e884f8438a 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -14,7 +14,6 @@ use crate::bindings::PGAsyncWriteResult;
 use crate::bindings::Safekeeper;
 use crate::bindings::Size;
 use crate::bindings::StringInfoData;
-use crate::bindings::TimeLineID;
 use crate::bindings::TimestampTz;
 use crate::bindings::WalProposer;
 use crate::bindings::WalProposerConnStatusType;
@@ -179,16 +178,11 @@ extern "C" fn conn_blocking_write(
     }
 }
 
-extern "C" fn recovery_download(
-    sk: *mut Safekeeper,
-    _timeline: TimeLineID,
-    startpos: XLogRecPtr,
-    endpos: XLogRecPtr,
-) -> bool {
+extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).recovery_download(&mut (*sk), startpos, endpos)
+        (*api).recovery_download(&mut (*wp), &mut (*sk))
     }
 }
 
@@ -354,14 +348,6 @@ extern "C" fn log_internal(
     }
 }
 
-extern "C" fn after_election(wp: *mut WalProposer) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).after_election(&mut (*wp))
-    }
-}
-
 #[derive(Debug)]
 pub enum Level {
     Debug5,
@@ -435,7 +421,6 @@ pub(crate) fn create_api() -> walproposer_api {
         process_safekeeper_feedback: Some(process_safekeeper_feedback),
         confirm_wal_streamed: Some(confirm_wal_streamed),
         log_internal: Some(log_internal),
-        after_election: Some(after_election),
     }
 }
 
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 013400325d..87001c9c66 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -86,7 +86,7 @@ pub trait ApiImpl {
         todo!()
     }
 
-    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
+    fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
         todo!()
     }
 
@@ -364,6 +364,14 @@ mod tests {
             true
         }
 
+        fn recovery_download(
+            &self,
+            _wp: &mut crate::bindings::WalProposer,
+            _sk: &mut crate::bindings::Safekeeper,
+        ) -> bool {
+            true
+        }
+
         fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
             println!("wal_reader_allocate");
             crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 4fb9a46d15..5874d199f9 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -809,7 +809,7 @@ RecvVoteResponse(Safekeeper *sk)
 	}
 	else if (wp->n_votes > wp->quorum)
 	{
-		/* recovery already performed, just start streaming */
+		/* already elected, start streaming */
 		SendProposerElected(sk);
 	}
 	else
@@ -835,21 +835,16 @@ HandleElectedProposer(WalProposer *wp)
 	DetermineEpochStartLsn(wp);
 
 	/*
-	 * Check if not all safekeepers are up-to-date, we need to download WAL
-	 * needed to synchronize them
+	 * Synchronously download WAL from the most advanced safekeeper. We do
+	 * that only for logical replication (and switching logical walsenders to
+	 * neon_walreader is a todo.)
 	 */
-	if (wp->truncateLsn < wp->propEpochStartLsn)
+	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
 	{
-		walprop_log(LOG,
-					"start recovery because truncateLsn=%X/%X is not "
-					"equal to epochStartLsn=%X/%X",
-					LSN_FORMAT_ARGS(wp->truncateLsn),
-					LSN_FORMAT_ARGS(wp->propEpochStartLsn));
-		/* Perform recovery */
-		if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
-			walprop_log(FATAL, "Failed to recover state");
+		walprop_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
-	else if (wp->config->syncSafekeepers)
+
+	if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
 	{
 		/* Sync is not needed: just exit */
 		wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
@@ -1047,13 +1042,6 @@ DetermineEpochStartLsn(WalProposer *wp)
 		}
 		walprop_shared->mineLastElectedTerm = wp->propTerm;
 	}
-
-	/*
-	 * WalProposer has just elected itself and initialized history, so we can
-	 * call election callback. Usually it updates truncateLsn to fetch WAL for
-	 * logical replication.
-	 */
-	wp->api.after_election(wp);
 }
 
 /*
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index a90e87b54f..2b2c252a18 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -485,8 +485,11 @@ typedef struct walproposer_api
 	/* Blocking CopyData write, aka PQputCopyData + PQflush. */
 	bool		(*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size);
 
-	/* Download WAL from startpos to endpos and make it available locally. */
-	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
+	/*
+	 * Download WAL before basebackup for logical walsenders from sk, if
+	 * needed
+	 */
+	bool		(*recovery_download) (WalProposer *wp, Safekeeper *sk);
 
 	/* Allocate WAL reader. */
 	void		(*wal_reader_allocate) (Safekeeper *sk);
@@ -556,14 +559,6 @@ typedef struct walproposer_api
 	 * handled by elog().
 	 */
 	void		(*log_internal) (WalProposer *wp, int level, const char *line);
-
-	/*
-	 * Called right after the proposer was elected, but before it started
-	 * recovery and sent ProposerElected message to the safekeepers.
-	 *
-	 * Used by logical replication to update truncateLsn.
-	 */
-	void		(*after_election) (WalProposer *wp);
 } walproposer_api;
 
 /*
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 6199def43f..734e627b4d 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -101,6 +101,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events);
 static void update_nwr_event_set(Safekeeper *sk, uint32 events);
 static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
 
+static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
+
 static void
 init_walprop_config(bool syncSafekeepers)
 {
@@ -1211,16 +1213,38 @@ XLogBroadcastWalProposer(WalProposer *wp)
 	}
 }
 
-/*
- * Receive WAL from most advanced safekeeper
- */
+/* Download WAL before basebackup for logical walsenders from sk, if needed */
 static bool
-WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos)
+WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 {
 	char	   *err;
 	WalReceiverConn *wrconn;
 	WalRcvStreamOptions options;
 	char		conninfo[MAXCONNINFO];
+	TimeLineID	timeline;
+	XLogRecPtr	startpos;
+	XLogRecPtr	endpos;
+	uint64		download_range_mb;
+
+	startpos = GetLogRepRestartLSN(wp);
+	if (startpos == InvalidXLogRecPtr)
+		return true;			/* recovery not needed */
+	endpos = wp->propEpochStartLsn;
+
+	/*
+	 * If we need to download more than a max_slot_wal_keep_size, cap to it to
+	 * avoid risk of exploding pg_wal. Logical replication won't work until
+	 * recreated, but at least compute would start; this also follows
+	 * max_slot_wal_keep_size semantics.
+	 */
+	download_range_mb = (endpos - startpos) / 1024 / 1024;
+	if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
+	{
+		startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024;
+		walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
+					LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
+	}
+	timeline = wp->greetRequest.timeline;
 
 	if (!neon_auth_token)
 	{
@@ -1250,7 +1274,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
 		return false;
 	}
 	elog(LOG,
-		 "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline "
+		 "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
 		 "%d",
 		 sk->host, sk->port, (uint32) (startpos >> 32),
 		 (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
@@ -1928,15 +1952,15 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
 	elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);
 }
 
-static void
-walprop_pg_after_election(WalProposer *wp)
+static XLogRecPtr
+GetLogRepRestartLSN(WalProposer *wp)
 {
 	FILE	   *f;
-	XLogRecPtr	lrRestartLsn;
+	XLogRecPtr	lrRestartLsn = InvalidXLogRecPtr;
 
 	/* We don't need to do anything in syncSafekeepers mode. */
 	if (wp->config->syncSafekeepers)
-		return;
+		return InvalidXLogRecPtr;
 
 	/*
 	 * If there are active logical replication subscription we need to provide
@@ -1944,25 +1968,40 @@ walprop_pg_after_election(WalProposer *wp)
 	 * replication slots.
 	 */
 	f = fopen("restart.lsn", "rb");
-	if (f != NULL && !wp->config->syncSafekeepers)
+	if (f != NULL)
 	{
-		size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
+		size_t		rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
+
 		fclose(f);
 		if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr)
 		{
-			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
+			uint64		download_range_mb;
 
-			if (max_slot_wal_keep_size_mb <= 0 || lrRestartLsn + max_slot_wal_keep_size_mb*MB > wp->truncateLsn)
+			elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
+
+			/*
+			 * If we need to download more than a max_slot_wal_keep_size,
+			 * don't do it to avoid risk of exploding pg_wal. Logical
+			 * replication won't work until recreated, but at least compute
+			 * would start; this also follows max_slot_wal_keep_size
+			 * semantics.
+			 */
+			download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
+			if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
 			{
-				/*
-				 * start from the beginning of the segment to fetch page headers
-				 * verifed by XLogReader
-				 */
-				lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
-				wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
+				walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
+							LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
+				return InvalidXLogRecPtr;
 			}
+
+			/*
+			 * start from the beginning of the segment to fetch page headers
+			 * verified by XLogReader
+			 */
+			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
 		}
 	}
+	return lrRestartLsn;
 }
 
 static const walproposer_api walprop_pg = {
@@ -1997,5 +2036,4 @@ static const walproposer_api walprop_pg = {
 	.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
 	.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
 	.log_internal = walprop_pg_log_internal,
-	.after_election = walprop_pg_after_election,
 };

From 854df0f566e717bb2fc640201a8c11cbd0d2d125 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 14 Dec 2023 17:40:42 +0300
Subject: [PATCH 165/202] Do PQgetCopyData before PQconsumeInput in
 libpqwp_async_read.

To avoid a lot of redundant memmoves and bloated input buffer.

fixes https://github.com/neondatabase/neon/issues/6055
---
 pgxn/neon/walproposer_pg.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 734e627b4d..0999156431 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -820,8 +820,7 @@ walprop_flush(Safekeeper *sk)
 PGAsyncReadResult
 libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 {
-
-	int			result;
+	int			rawlen;
 
 	if (conn->recvbuf != NULL)
 	{
@@ -829,12 +828,19 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 		conn->recvbuf = NULL;
 	}
 
-	/* Call PQconsumeInput so that we have the data we need */
-	if (!PQconsumeInput(conn->pg_conn))
+	/* Try to receive a CopyData message */
+	rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true);
+	if (rawlen == 0)
 	{
-		*amount = 0;
-		*buf = NULL;
-		return PG_ASYNC_READ_FAIL;
+		/* Try consuming some data. */
+		if (!PQconsumeInput(conn->pg_conn))
+		{
+			*amount = 0;
+			*buf = NULL;
+			return PG_ASYNC_READ_FAIL;
+		}
+		/* Now that we've consumed some input, try again */
+		rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true);
 	}
 
 	/*
@@ -848,7 +854,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 	 * sometimes be triggered by the server returning an ErrorResponse (which
 	 * also happens to have the effect that the copy is done).
 	 */
-	switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
+	switch (rawlen)
 	{
 		case 0:
 			*amount = 0;
@@ -883,7 +889,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 			return PG_ASYNC_READ_FAIL;
 		default:
 			/* Positive values indicate the size of the returned result */
-			*amount = result;
+			*amount = rawlen;
 			*buf = conn->recvbuf;
 			return PG_ASYNC_READ_SUCCESS;
 	}

From 1f1c50e8c7f737213bdc7c670c7ef204c52a6f9c Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 15 Dec 2023 11:25:44 +0300
Subject: [PATCH 166/202] Don't re-add neon_walreader socket to waiteventset if
 possible.

Should make recovery slightly more efficient (likely negligibly).
---
 pgxn/neon/neon_walreader.c | 11 +++++++++++
 pgxn/neon/neon_walreader.h |  1 +
 pgxn/neon/walproposer.h    | 12 +++++++++++-
 pgxn/neon/walproposer_pg.c | 32 ++++++++++++++++----------------
 4 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c
index f035c2928f..f7ec9e5bfa 100644
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -556,6 +556,17 @@ NeonWALReaderSocket(NeonWALReader *state)
 	return PQsocket(state->wp_conn->pg_conn);
 }
 
+/*
+ * Whether remote connection is established. Once it is, the socket stays
+ * stable until a successful local read or an error, so the user can update
+ * socket events instead of re-adding the socket each time.
+ */
+bool
+NeonWALReaderIsRemConnEstablished(NeonWALReader *state)
+{
+	return state->rem_state == RS_ESTABLISHED;
+}
+
 /*
  * Returns events user should wait on connection socket or 0 if remote
  * connection is not active.
diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h
index 805c94fc53..6be9f149aa 100644
--- a/pgxn/neon/neon_walreader.h
+++ b/pgxn/neon/neon_walreader.h
@@ -24,6 +24,7 @@ extern void NeonWALReaderFree(NeonWALReader *state);
 extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
 extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
 extern uint32 NeonWALReaderEvents(NeonWALReader *state);
+extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state);
 extern char *NeonWALReaderErrMsg(NeonWALReader *state);
 
 #endif							/* __NEON_WALREADER_H__ */
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 2b2c252a18..4c2b53a1ef 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -377,9 +377,19 @@ typedef struct Safekeeper
 	int			eventPos;
 
 	/*
-	 * Neon WAL reader position in wait event set, or -1 if no socket.
+	 * Neon WAL reader position in wait event set, or -1 if no socket. Note
+	 * that event must be removed not only on error/failure, but also on
+	 * successful *local* read, as next read might again be remote, but with
+	 * different socket.
 	 */
 	int			nwrEventPos;
+
+	/*
+	 * Per libpq docs, the socket might change during connection establishment;
+	 * remember here whether it is stable, to avoid re-adding it to the event
+	 * set if possible. Must be reset whenever the nwr event is deleted.
+	 */
+	bool		nwrConnEstablished;
 #endif
 
 
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 0999156431..57be2d8d96 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1541,6 +1541,7 @@ walprop_pg_free_event_set(WalProposer *wp)
 	{
 		wp->safekeeper[i].eventPos = -1;
 		wp->safekeeper[i].nwrEventPos = -1;
+		wp->safekeeper[i].nwrConnEstablished = false;
 	}
 }
 
@@ -1561,6 +1562,7 @@ walprop_pg_init_event_set(WalProposer *wp)
 	{
 		wp->safekeeper[i].eventPos = -1;
 		wp->safekeeper[i].nwrEventPos = -1;
+		wp->safekeeper[i].nwrConnEstablished = false;
 	}
 }
 
@@ -1578,6 +1580,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events)
 {
 	Assert(sk->nwrEventPos == -1);
 	sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
+	sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
 	elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
 }
 
@@ -1619,14 +1622,19 @@ walprop_pg_active_state_update_event_set(Safekeeper *sk)
 	if (sk->active_state == SS_ACTIVE_READ_WAL)
 	{
 		/*
-		 * TODO: instead of reattaching socket (and thus recreating WES) each
-		 * time we should keep it if possible, i.e. if connection is already
-		 * established. Note that single neon_walreader object can switch
-		 * between local and remote reads multiple times during its lifetime,
-		 * so careful bookkeeping is needed here.
+		 * If conn is established and socket is thus stable, update the event
+		 * directly; otherwise re-add it.
 		 */
-		rm_safekeeper_event_set(sk, false);
-		add_nwr_event_set(sk, nwr_events);
+		if (sk->nwrConnEstablished)
+		{
+			Assert(sk->nwrEventPos != -1);
+			update_nwr_event_set(sk, nwr_events);
+		}
+		else
+		{
+			rm_safekeeper_event_set(sk, false);
+			add_nwr_event_set(sk, nwr_events);
+		}
 	}
 	else
 	{
@@ -1701,14 +1709,6 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
 	{
 		Safekeeper *sk = &wp->safekeeper[i];
 
-		if (sk == to_remove)
-		{
-			if (is_sk)
-				sk->eventPos = -1;
-			else
-				sk->nwrEventPos = -1;
-		}
-
 		/*
 		 * If this safekeeper isn't offline, add events for it, except for the
 		 * event requested to remove.
@@ -1725,7 +1725,7 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
 				/* will set sk->eventPos */
 				wp->api.add_safekeeper_event_set(sk, sk_events);
 			}
-			else if ((sk != to_remove || is_sk) && nwr_events)
+			if ((sk != to_remove || is_sk) && nwr_events)
 			{
 				add_nwr_event_set(sk, nwr_events);
 			}

From d5fbfe2399cc85f461fc6c3b3a32077d0b9ebd73 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 15 Dec 2023 16:02:42 +0300
Subject: [PATCH 167/202] Remove test_wal_deleted_after_broadcast.

It is superseded by stronger test_lagging_sk.
---
 test_runner/regress/test_wal_acceptor.py | 54 ------------------------
 1 file changed, 54 deletions(-)

diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 5a0856c69c..cf8df389c8 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1557,60 +1557,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
 
-# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted
-# to all safekeepers. This test checks that compute WAL can fit into small number
-# of WAL segments.
-def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder):
-    # used to calculate delta in collect_stats
-    last_lsn = Lsn(0)
-
-    # returns pg_wal size in MB
-    def collect_stats(endpoint: Endpoint, cur, enable_logs=True):
-        nonlocal last_lsn
-        assert endpoint.pgdata_dir is not None
-
-        log.info("executing INSERT to generate WAL")
-        current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-        pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024
-        if enable_logs:
-            lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024
-            log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB")
-        last_lsn = current_lsn
-        return pg_wal_size_mb
-
-    # generates about ~20MB of WAL, to create at least one new segment
-    def generate_wal(cur):
-        cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'")
-
-    neon_env_builder.num_safekeepers = 3
-    env = neon_env_builder.init_start()
-
-    env.neon_cli.create_branch("test_wal_deleted_after_broadcast")
-    # Adjust checkpoint config to prevent keeping old WAL segments
-    endpoint = env.endpoints.create_start(
-        "test_wal_deleted_after_broadcast",
-        config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"],
-    )
-
-    pg_conn = endpoint.connect()
-    cur = pg_conn.cursor()
-    cur.execute("CREATE TABLE t(key int, value text)")
-
-    collect_stats(endpoint, cur)
-
-    # generate WAL to simulate normal workload
-    for _ in range(5):
-        generate_wal(cur)
-        collect_stats(endpoint, cur)
-
-    log.info("executing checkpoint")
-    cur.execute("CHECKPOINT")
-    wal_size_after_checkpoint = collect_stats(endpoint, cur)
-
-    # there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
-    assert wal_size_after_checkpoint < 16 * 2.5
-
-
 @pytest.mark.parametrize("auth_enabled", [False, True])
 def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     neon_env_builder.auth_enabled = auth_enabled

From bfc98f36e34467c271afe851bc23e90b95d0ead6 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Sat, 16 Dec 2023 00:00:49 +0300
Subject: [PATCH 168/202] Refactor handling responses in walproposer.

Remove confirm_wal_streamed; we already apply both write and flush positions of
the slot to commit_lsn which is fine because 1) we need to wake up waiters 2)
committed WAL can be fetched from safekeepers by neon_walreader now.
---
 libs/walproposer/src/api_bindings.rs |  9 ---
 libs/walproposer/src/walproposer.rs  |  4 --
 pgxn/neon/walproposer.c              | 33 ++++-------
 pgxn/neon/walproposer.h              |  6 --
 pgxn/neon/walproposer_pg.c           | 85 ++++++++++++++--------------
 5 files changed, 56 insertions(+), 81 deletions(-)

diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index e884f8438a..1f7bf952dc 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -326,14 +326,6 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog
     }
 }
 
-extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).confirm_wal_streamed(&mut (*wp), lsn)
-    }
-}
-
 extern "C" fn log_internal(
     wp: *mut WalProposer,
     level: ::std::os::raw::c_int,
@@ -419,7 +411,6 @@ pub(crate) fn create_api() -> walproposer_api {
         get_redo_start_lsn: Some(get_redo_start_lsn),
         finish_sync_safekeepers: Some(finish_sync_safekeepers),
         process_safekeeper_feedback: Some(process_safekeeper_feedback),
-        confirm_wal_streamed: Some(confirm_wal_streamed),
         log_internal: Some(log_internal),
     }
 }
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 87001c9c66..35c8f6904d 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -142,10 +142,6 @@ pub trait ApiImpl {
         todo!()
     }
 
-    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
-        todo!()
-    }
-
     fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
         todo!()
     }
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 5874d199f9..7fb0cab9a0 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1643,35 +1643,26 @@ static void
 HandleSafekeeperResponse(WalProposer *wp)
 {
 	XLogRecPtr	minQuorumLsn;
-	XLogRecPtr	minFlushLsn;
+	XLogRecPtr	candidateTruncateLsn;
 
 	minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
 	wp->api.process_safekeeper_feedback(wp, minQuorumLsn);
 
 	/*
-	 * Try to advance truncateLsn to minFlushLsn, which is the last record
-	 * flushed to all safekeepers. We must always start streaming from the
-	 * beginning of the record, which simplifies decoding on the far end.
+	 * Try to advance truncateLsn -- the last record flushed to all
+	 * safekeepers.
 	 *
-	 * Advanced truncateLsn should be not further than nearest commitLsn. This
-	 * prevents surprising violation of truncateLsn <= commitLsn invariant
-	 * which might occur because 1) truncateLsn can be advanced immediately
-	 * once chunk is broadcast to all safekeepers, and commitLsn generally
-	 * can't be advanced based on feedback from safekeeper who is still in the
-	 * previous epoch (similar to 'leader can't commit entries from previous
-	 * term' in Raft); 2) chunks we read from WAL and send are plain sheets of
-	 * bytes, but safekeepers ack only on record boundaries.
+	 * Advanced truncateLsn should not be higher than commitLsn. This prevents
+	 * surprising violation of truncateLsn <= commitLsn invariant which might
+	 * occur because commitLsn generally can't be advanced based on feedback
+	 * from safekeeper who is still in the previous epoch (similar to 'leader
+	 * can't commit entries from previous term' in Raft).
 	 */
-	minFlushLsn = CalculateMinFlushLsn(wp);
-	if (minFlushLsn > wp->truncateLsn)
+	candidateTruncateLsn = CalculateMinFlushLsn(wp);
+	candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn);
+	if (candidateTruncateLsn > wp->truncateLsn)
 	{
-		wp->truncateLsn = minFlushLsn;
-
-		/*
-		 * Advance the replication slot to free up old WAL files. Note that
-		 * slot doesn't exist if we are in syncSafekeepers mode.
-		 */
-		wp->api.confirm_wal_streamed(wp, wp->truncateLsn);
+		wp->truncateLsn = candidateTruncateLsn;
 	}
 
 	/*
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 4c2b53a1ef..6d478076fe 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -557,12 +557,6 @@ typedef struct walproposer_api
 	 */
 	void		(*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
 
-	/*
-	 * Called on peer_horizon_lsn updates. Used to advance replication slot
-	 * and to free up disk space by deleting unnecessary WAL.
-	 */
-	void		(*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn);
-
 	/*
 	 * Write a log message to the internal log processor. This is used only
 	 * when walproposer is compiled as a library. Otherwise, all logging is
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 57be2d8d96..10c740840f 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1812,7 +1812,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
 }
 
 /*
- * Get PageserverFeedback fields from the most advanced safekeeper
+ * Choose most advanced PageserverFeedback and set it to *rf.
  */
 static void
 GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
@@ -1842,8 +1842,6 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
 		 LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
 		 LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
 		 rf->replytime);
-
-	replication_feedback_set(rf);
 }
 
 /*
@@ -1883,63 +1881,69 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 		hs->catalog_xmin = InvalidFullTransactionId;
 }
 
+/*
+ * Based on commitLsn and safekeeper responses including pageserver feedback,
+ * 1) Propagate cluster size received from ps to ensure the limit.
+ * 2) Propagate pageserver LSN positions to ensure backpressure limits.
+ * 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters).
+ * 4) Propagate hot standby feedback.
+ *
+ * None of that is functional in sync-safekeepers.
+ */
 static void
 walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
 {
 	HotStandbyFeedback hsFeedback;
-	XLogRecPtr	diskConsistentLsn;
+	XLogRecPtr	oldDiskConsistentLsn;
 
-	diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
+	if (wp->config->syncSafekeepers)
+		return;
 
-	if (!wp->config->syncSafekeepers)
+	oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
+
+	/* Get PageserverFeedback fields from the most advanced safekeeper */
+	GetLatestNeonFeedback(&quorumFeedback.rf, wp);
+	replication_feedback_set(&quorumFeedback.rf);
+	SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
+
+	if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
 	{
-		/* Get PageserverFeedback fields from the most advanced safekeeper */
-		GetLatestNeonFeedback(&quorumFeedback.rf, wp);
-		SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
-	}
-
-	if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
-	{
-
 		if (commitLsn > quorumFeedback.flushLsn)
 			quorumFeedback.flushLsn = commitLsn;
 
-		/* advance the replication slot */
-		if (!wp->config->syncSafekeepers)
-			ProcessStandbyReply(
-			/* write_lsn -  This is what durably stored in WAL service. */
-								quorumFeedback.flushLsn,
-			/* flush_lsn - This is what durably stored in WAL service. */
-								quorumFeedback.flushLsn,
+		/*
+		 * Advance the replication slot to commitLsn. WAL before it is
+		 * hardened and will be fetched from one of safekeepers by
+		 * neon_walreader if needed.
+		 *
+		 * Also wakes up syncrep waiters.
+		 */
+		ProcessStandbyReply(
+		/* write_lsn -  This is what durably stored in WAL service. */
+							quorumFeedback.flushLsn,
+		/* flush_lsn - This is what durably stored in WAL service. */
+							quorumFeedback.flushLsn,
 
-			/*
-			 * apply_lsn - This is what processed and durably saved at*
-			 * pageserver.
-			 */
-								quorumFeedback.rf.disk_consistent_lsn,
-								walprop_pg_get_current_timestamp(wp), false);
+		/*
+		 * apply_lsn - This is what processed and durably saved at*
+		 * pageserver.
+		 */
+							quorumFeedback.rf.disk_consistent_lsn,
+							walprop_pg_get_current_timestamp(wp), false);
 	}
 
 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
 	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
 	{
 		quorumFeedback.hs = hsFeedback;
-		if (!wp->config->syncSafekeepers)
-			ProcessStandbyHSFeedback(hsFeedback.ts,
-									 XidFromFullTransactionId(hsFeedback.xmin),
-									 EpochFromFullTransactionId(hsFeedback.xmin),
-									 XidFromFullTransactionId(hsFeedback.catalog_xmin),
-									 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
+		ProcessStandbyHSFeedback(hsFeedback.ts,
+								 XidFromFullTransactionId(hsFeedback.xmin),
+								 EpochFromFullTransactionId(hsFeedback.xmin),
+								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
+								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
 	}
 }
 
-static void
-walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn)
-{
-	if (MyReplicationSlot)
-		PhysicalConfirmReceivedLocation(lsn);
-}
-
 static XLogRecPtr
 walprop_pg_get_redo_start_lsn(WalProposer *wp)
 {
@@ -2040,6 +2044,5 @@ static const walproposer_api walprop_pg = {
 	.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
 	.finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
 	.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
-	.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
 	.log_internal = walprop_pg_log_internal,
 };

From ddc431fc8f5cd48073fad5f1f1246cdc198e6954 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 26 Dec 2023 12:03:42 +0300
Subject: [PATCH 169/202] pgindent walproposer condvar comment

---
 pgxn/neon/walproposer_pg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 10c740840f..7773aabfab 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1748,8 +1748,8 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 		ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv);
 
 	/*
-	 * Now that we prepared the condvar, check flush ptr again -- it might have
-	 * changed before we subscribed to cv so we missed the wakeup.
+	 * Now that we prepared the condvar, check flush ptr again -- it might
+	 * have changed before we subscribed to cv so we missed the wakeup.
 	 *
 	 * Do that only when we're interested in new WAL: without sync-safekeepers
 	 * and if election already passed.

From 6e40900569df5c09763034198990560bc1eee6aa Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 14 Dec 2023 15:08:14 +0000
Subject: [PATCH 170/202] Manage pgbouncer configuration from compute_ctl: -
 add pgbouncer_settings section to compute spec; - add pgbouncer-connstr
 option to compute_ctl. - add pgbouncer-ini-path option to compute_ctl.
 Default: /etc/pgbouncer/pgbouncer.ini

Apply pgbouncer config on compute start and respec to override default spec.

Save pgbouncer config updates to pgbouncer.ini to preserve them across pgbouncer restarts.
---
 Cargo.lock                               | 67 +++++++++++++++++++++++
 compute_tools/Cargo.toml                 |  1 +
 compute_tools/src/bin/compute_ctl.rs     | 26 ++++++++-
 compute_tools/src/compute.rs             | 56 +++++++++++++++++++
 compute_tools/src/pg_helpers.rs          | 69 +++++++++++++++++++++++-
 control_plane/src/endpoint.rs            |  1 +
 deny.toml                                |  1 +
 libs/compute_api/src/spec.rs             |  2 +
 libs/compute_api/tests/cluster_spec.json |  4 ++
 vm-image-spec.yaml                       |  1 +
 workspace_hack/Cargo.toml                |  4 ++
 11 files changed, 230 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0be6d5d183..abd87dc0da 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1168,6 +1168,7 @@ dependencies = [
  "regex",
  "remote_storage",
  "reqwest",
+ "rust-ini",
  "serde",
  "serde_json",
  "tar",
@@ -1201,6 +1202,26 @@ version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
 
+[[package]]
+name = "const-random"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a"
+dependencies = [
+ "const-random-macro",
+]
+
+[[package]]
+name = "const-random-macro"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
+dependencies = [
+ "getrandom 0.2.11",
+ "once_cell",
+ "tiny-keccak",
+]
+
 [[package]]
 name = "const_fn"
 version = "0.4.9"
@@ -1433,6 +1454,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "crunchy"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
+
 [[package]]
 name = "crypto-bigint"
 version = "0.4.9"
@@ -1575,6 +1602,15 @@ dependencies = [
  "syn 2.0.32",
 ]
 
+[[package]]
+name = "dlv-list"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
+dependencies = [
+ "const-random",
+]
+
 [[package]]
 name = "dyn-clone"
 version = "1.0.14"
@@ -3043,6 +3079,16 @@ dependencies = [
  "tokio-stream",
 ]
 
+[[package]]
+name = "ordered-multimap"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
+dependencies = [
+ "dlv-list",
+ "hashbrown 0.14.0",
+]
+
 [[package]]
 name = "os_info"
 version = "3.7.0"
@@ -4216,6 +4262,16 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "rust-ini"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
+dependencies = [
+ "cfg-if",
+ "ordered-multimap",
+]
+
 [[package]]
 name = "rustc-demangle"
 version = "0.1.23"
@@ -5170,6 +5226,15 @@ dependencies = [
  "time-core",
 ]
 
+[[package]]
+name = "tiny-keccak"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
+dependencies = [
+ "crunchy",
+]
+
 [[package]]
 name = "tinytemplate"
 version = "1.2.1"
@@ -6337,6 +6402,7 @@ dependencies = [
  "futures-io",
  "futures-sink",
  "futures-util",
+ "getrandom 0.2.11",
  "hex",
  "hmac",
  "hyper",
@@ -6348,6 +6414,7 @@ dependencies = [
  "num-bigint",
  "num-integer",
  "num-traits",
+ "once_cell",
  "prost",
  "rand 0.8.5",
  "regex",
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 18b30810b0..142fa08495 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -39,3 +39,4 @@ remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
 bytes = "1.0"
+rust-ini = "0.20.0"
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index ce7345d5be..436db59088 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -31,7 +31,9 @@
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
-//!             -r http://pg-ext-s3-gateway
+//!             -r http://pg-ext-s3-gateway \
+//!             --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' \
+//!             --pgbouncer-ini-path /etc/pgbouncer.ini
 //! ```
 //!
 use std::collections::HashMap;
@@ -99,6 +101,9 @@ fn main() -> Result<()> {
     let spec_json = matches.get_one::<String>("spec");
     let spec_path = matches.get_one::<String>("spec-path");
 
+    let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
+    let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
+
     // Extract OpenTelemetry context for the startup actions from the
     // TRACEPARENT and TRACESTATE env variables, and attach it to the current
     // tracing context.
@@ -209,6 +214,8 @@ fn main() -> Result<()> {
         ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
         ext_download_progress: RwLock::new(HashMap::new()),
         build_tag,
+        pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
+        pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
     };
     let compute = Arc::new(compute_node);
 
@@ -493,6 +500,23 @@ fn cli() -> clap::Command {
                 )
                 .value_name("FILECACHE_CONNSTR"),
         )
+        .arg(
+            Arg::new("pgbouncer-connstr")
+                .long("pgbouncer-connstr")
+                .default_value(
+                    "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
+                )
+                .value_name("PGBOUNCER_CONNSTR"),
+        )
+        .arg(
+            Arg::new("pgbouncer-ini-path")
+                .long("pgbouncer-ini-path")
+                // Note: this doesn't match current path for pgbouncer.ini.
+                // Until we fix it, we need to pass the path explicitly
+                // or this will be effectively no-op.
+                .default_value("/etc/pgbouncer.ini")
+                .value_name("PGBOUNCER_INI_PATH"),
+        )
 }
 
 #[test]
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index b39a800f14..cd7be0520e 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -7,6 +7,7 @@ use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
 use std::sync::{Condvar, Mutex, RwLock};
+use std::thread;
 use std::time::Instant;
 
 use anyhow::{Context, Result};
@@ -64,6 +65,10 @@ pub struct ComputeNode {
     // key: ext_archive_name, value: started download time, download_completed?
     pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
     pub build_tag: String,
+    // connection string to pgbouncer to change settings
+    pub pgbouncer_connstr: Option<String>,
+    // path to pgbouncer.ini to change settings
+    pub pgbouncer_ini_path: Option<String>,
 }
 
 // store some metrics about download size that might impact startup time
@@ -737,6 +742,31 @@ impl ComputeNode {
     pub fn reconfigure(&self) -> Result<()> {
         let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
 
+        if let Some(connstr) = &self.pgbouncer_connstr {
+            info!("tuning pgbouncer with connstr: {:?}", connstr);
+
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+                .expect("failed to create rt");
+
+            // Spawn a thread to do the tuning,
+            // so that we don't block the main thread that starts Postgres.
+            let pgbouncer_settings = spec.pgbouncer_settings.clone();
+            let connstr_clone = connstr.clone();
+            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
+            let _handle = thread::spawn(move || {
+                let res = rt.block_on(tune_pgbouncer(
+                    pgbouncer_settings,
+                    &connstr_clone,
+                    pgbouncer_ini_path,
+                ));
+                if let Err(err) = res {
+                    error!("error while tuning pgbouncer: {err:?}");
+                }
+            });
+        }
+
         // Write new config
         let pgdata_path = Path::new(&self.pgdata);
         let postgresql_conf_path = pgdata_path.join("postgresql.conf");
@@ -791,6 +821,32 @@ impl ComputeNode {
             pspec.timeline_id,
         );
 
+        // tune pgbouncer
+        if let Some(connstr) = &self.pgbouncer_connstr {
+            info!("tuning pgbouncer with connstr: {:?}", connstr);
+
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+                .expect("failed to create rt");
+
+            // Spawn a thread to do the tuning,
+            // so that we don't block the main thread that starts Postgres.
+            let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
+            let connstr_clone = connstr.clone();
+            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
+            let _handle = thread::spawn(move || {
+                let res = rt.block_on(tune_pgbouncer(
+                    pgbouncer_settings,
+                    &connstr_clone,
+                    pgbouncer_ini_path,
+                ));
+                if let Err(err) = res {
+                    error!("error while tuning pgbouncer: {err:?}");
+                }
+            });
+        }
+
         info!(
             "start_compute spec.remote_extensions {:?}",
             pspec.spec.remote_extensions
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index b79e516650..0b0e137c03 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -9,9 +9,11 @@ use std::process::Child;
 use std::time::{Duration, Instant};
 
 use anyhow::{bail, Result};
+use ini::Ini;
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
-use tracing::{debug, instrument};
+use tokio_postgres::NoTls;
+use tracing::{debug, error, info, instrument};
 
 use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 
@@ -359,3 +361,68 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
 
     Ok(())
 }
+
+/// Update pgbouncer.ini with provided options; fails if it has no `[pgbouncer]` section.
+pub fn update_pgbouncer_ini(
+    pgbouncer_config: HashMap<String, String>,
+    pgbouncer_ini_path: &str,
+) -> Result<()> {
+    let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
+    let Some(section) = conf.section_mut(Some("pgbouncer")) else {
+        bail!("no [pgbouncer] section in {pgbouncer_ini_path}");
+    };
+    for (option_name, value) in pgbouncer_config.iter() {
+        section.insert(option_name, value);
+    }
+    conf.write_to_file(pgbouncer_ini_path)?;
+    Ok(())
+}
+
+/// Tune pgbouncer.
+/// 1. Apply new config using pgbouncer admin console
+/// 2. Add new values to pgbouncer.ini to preserve them after restart
+///
+/// Does nothing when `pgbouncer_settings` is `None`. Fails if the admin console
+/// connection or the ini update fails; a rejected `SET` is only logged.
+pub async fn tune_pgbouncer(
+    pgbouncer_settings: Option<HashMap<String, String>>,
+    pgbouncer_connstr: &str,
+    pgbouncer_ini_path: Option<String>,
+) -> Result<()> {
+    if let Some(pgbouncer_config) = pgbouncer_settings {
+        // Apply new config; a failed connection is returned to the caller, not a panic.
+        let (client, connection) = tokio_postgres::connect(pgbouncer_connstr, NoTls).await?;
+        tokio::spawn(async move {
+            if let Err(e) = connection.await {
+                error!("connection error: {}", e);
+            }
+        });
+
+        for (option_name, value) in pgbouncer_config.iter() {
+            info!(
+                "Applying pgbouncer setting change: {} = {}",
+                option_name, value
+            );
+            let query = format!("SET {} = {}", option_name, value);
+            let result = client.simple_query(&query).await;
+            info!("Applying pgbouncer setting change: {}", query);
+            info!("pgbouncer setting change result: {:?}", result);
+
+            if let Err(err) = result {
+                // Don't fail on error, just print it into log
+                error!(
+                    "Failed to apply pgbouncer setting change: {},  {}",
+                    query, err
+                );
+            };
+        }
+
+        // save values to pgbouncer.ini
+        // so that they are preserved after pgbouncer restart
+        if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
+            update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
+        }
+    }
+
+    Ok(())
+}
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 071f22dc2b..55b66742ca 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -537,6 +537,7 @@ impl Endpoint {
             safekeeper_connstrings,
             storage_auth_token: auth_token.clone(),
             remote_extensions,
+            pgbouncer_settings: None,
         };
         let spec_path = self.endpoint_path().join("spec.json");
         std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
diff --git a/deny.toml b/deny.toml
index 079dcac679..22e39a2ca3 100644
--- a/deny.toml
+++ b/deny.toml
@@ -35,6 +35,7 @@ allow = [
     "Artistic-2.0",
     "BSD-2-Clause",
     "BSD-3-Clause",
+    "CC0-1.0",
     "ISC",
     "MIT",
     "MPL-2.0",
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 2a483188e4..4ff6831272 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -73,6 +73,8 @@ pub struct ComputeSpec {
 
     // information about available remote extensions
     pub remote_extensions: Option<RemoteExtSpec>,
+
+    pub pgbouncer_settings: Option<HashMap<String, String>>,
 }
 
 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json
index e2afa17ef0..ccd015ad19 100644
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -243,5 +243,9 @@
         "public_extensions": [
           "postgis"
         ]
+      },
+      "pgbouncer_settings": {
+        "default_pool_size": "42",
+        "pool_mode": "session"
       }
 }
diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 804405293f..68be0b3617 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -36,6 +36,7 @@ files:
       max_client_conn=10000
       default_pool_size=64
       max_prepared_statements=0
+      admin_users=cloud_admin
   - filename: cgconfig.conf
     content: |
       # Configuration for cgroups in VM compute nodes
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 82bbedc4ae..4f13064088 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -39,6 +39,7 @@ futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
+getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
@@ -50,6 +51,7 @@ nom = { version = "7" }
 num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128"] }
+once_cell = { version = "1" }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -84,11 +86,13 @@ anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 either = { version = "1" }
+getrandom = { version = "0.2", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
+once_cell = { version = "1" }
 prost = { version = "0.11" }
 regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }

From 136aab54793816ac86a386084ed858f522d334c5 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 26 Dec 2023 14:37:09 -0800
Subject: [PATCH 171/202] Bump postgres submodule versions

---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 0bb356aa0c..03358bb0b5 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 0bb356aa0cd1582112926fbcf0b5370222c2db6d
+Subproject commit 03358bb0b5e0d33c238710139e768db9e75cfcc8
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 24333abb81..a2dc225ddf 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 24333abb81a9ecae4541019478f0bf7d0b289df7
+Subproject commit a2dc225ddfc8cae1849aa2316f435c58f0333d8c
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 863b71572b..225071f482 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 863b71572bc441581efb3bbee2ad18af037be1bb
+Subproject commit 225071f482774943854c2eec4540757e01171557
diff --git a/vendor/revisions.json b/vendor/revisions.json
index a9575a2cb7..def4eab069 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb",
-    "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7",
-    "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d"
+    "postgres-v16": "225071f482774943854c2eec4540757e01171557",
+    "postgres-v15": "a2dc225ddfc8cae1849aa2316f435c58f0333d8c",
+    "postgres-v14": "03358bb0b5e0d33c238710139e768db9e75cfcc8"
 }

From e5a3b6dfd8e7b2c7f72902e33868eddf72713630 Mon Sep 17 00:00:00 2001
From: Bodobolero <peterbendel@neon.tech>
Date: Wed, 27 Dec 2023 18:15:17 +0100
Subject: [PATCH 172/202] Pg stat statements reset for neon superuser (#6232)

## Problem

Extension pg_stat_statements has function pg_stat_statements_reset().
In vanilla Postgres this function can only be called by superuser role
or other users/roles explicitly granted.
In Neon no end user can use superuser role.
Instead we have neon_superuser role.
We need to grant execute on pg_stat_statements_reset() to neon_superuser

## Summary of changes

Modify the Postgres v14, v15, v16 contrib in our compute docker file to
grant execute on pg_stat_statements_reset() to neon_superuser.
(Modifying it in our docker file is preferable to changes in
neondatabase/postgres because we want to limit the changes in our fork
that we have to carry with each new version of Postgres).

Note that the interface of proc/function pg_stat_statements_reset
changed in pg_stat_statements version 1.7

So for versions up to and including 1.6 we must

`GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO
neon_superuser;`

and for versions starting from 1.7 we must

`GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO
neon_superuser;`

If we just use `GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO
neon_superuser;` for all versions, this results in the following error for
versions 1.7+:

```sql
neondb=> create extension pg_stat_statements;
ERROR:  function pg_stat_statements_reset() does not exist
```



## Checklist before requesting a review

- [x ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [x ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

## I have run the following test and could now invoke
pg_stat_statements_reset() using default user

```bash
(neon) peterbendel@Peters-MBP neon % kubectl get pods | grep compute-quiet-mud-88416983
compute-quiet-mud-88416983-74f4bf67db-crl4c            3/3     Running     0          7m26s
(neon) peterbendel@Peters-MBP neon % kubectl set image deploy/compute-quiet-mud-88416983 compute-node=neondatabase/compute-node-v15:7307610371
deployment.apps/compute-quiet-mud-88416983 image updated
(neon) peterbendel@Peters-MBP neon % psql postgresql://peterbendel:<secret>@ep-bitter-sunset-73589702.us-east-2.aws.neon.build/neondb
psql (16.1, server 15.5)
SSL connection (protocol: TLSv1.3, cipher: TLS_AES_256_GCM_SHA384, compression: off)
Type "help" for help.

neondb=> select version();
                                              version
---------------------------------------------------------------------------------------------------
 PostgreSQL 15.5 on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
(1 row)

neondb=> create extension pg_stat_statements;
CREATE EXTENSION

neondb=> select pg_stat_statements_reset();
 pg_stat_statements_reset
--------------------------

(1 row)
```
---
 Dockerfile.compute-node | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 8db60ff85f..14ba1b5b9a 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -48,7 +48,29 @@ RUN cd postgres && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
+    # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
+    # In vanilla postgres this function is limited to Postgres role superuser.
+    # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
+    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
+    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
+    # so we do it here. Steps are chained with && so that a failure anywhere in this RUN fails the build.
+    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql" && \
+    # the first loop is for pg_stat_statements extension versions <= 1.6
+    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
+        filename=$(basename "$file"); \
+        if echo "$old_list" | grep -q -F "$filename"; then \
+            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> "$file"; \
+        fi; \
+    done && \
+    # the second loop is for pg_stat_statements extension versions >= 1.7,
+    # where pg_stat_statements_reset() got 3 additional arguments
+    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
+        filename=$(basename "$file"); \
+        if ! echo "$old_list" | grep -q -F "$filename"; then \
+            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> "$file"; \
+        fi; \
+    done
 
 #########################################################################################
 #

From 1c037209c775f0330c2ffc7c5c1826487c75b0e1 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 29 Dec 2023 09:32:24 +0000
Subject: [PATCH 173/202] proxy: fix compute addr parsing (#6237)

## Problem

control plane should be able to return domain names and not just IP
addresses.

## Summary of changes

1. add regression tests
2. use rsplit to split the port from the back, then trim the ipv6
brackets
---
 proxy/src/console/provider/neon.rs | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 628d98df49..5bf7b0f986 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -11,7 +11,7 @@ use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
-use std::{net::SocketAddr, sync::Arc};
+use std::sync::Arc;
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -141,7 +141,7 @@ impl Api {
             // We'll set username and such later using the startup message.
             // TODO: add more type safety (in progress).
             let mut config = compute::ConnCfg::new();
-            config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
+            config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
 
             let node = NodeInfo {
                 config,
@@ -269,9 +269,10 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
     Err(ApiError::Console { status, text })
 }
 
-fn parse_host_port(input: &str) -> Option<(String, u16)> {
-    let parsed: SocketAddr = input.parse().ok()?;
-    Some((parsed.ip().to_string(), parsed.port()))
+fn parse_host_port(input: &str) -> Option<(&str, u16)> {
+    let (addr, port_str) = input.rsplit_once(':')?;
+    let port: u16 = port_str.parse().ok()?;
+    Some((addr.trim_matches(|c: char| c == '[' || c == ']'), port))
 }
 
 #[cfg(test)]
@@ -279,9 +280,24 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_parse_host_port() {
+    fn test_parse_host_port_v4() {
         let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
         assert_eq!(host, "127.0.0.1");
         assert_eq!(port, 5432);
     }
+
+    #[test]
+    fn test_parse_host_port_v6() {
+        let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
+        assert_eq!(host, "2001:db8::1");
+        assert_eq!(port, 5432);
+    }
+
+    #[test]
+    fn test_parse_host_port_url() {
+        let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
+            .expect("failed to parse");
+        assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
+        assert_eq!(port, 5432);
+    }
 }

From f28bdb652854200e97febe9eb601a1dc2534266a Mon Sep 17 00:00:00 2001
From: Abhijeet Patil <abhi.gets.mail@gmail.com>
Date: Sat, 30 Dec 2023 13:45:31 +0000
Subject: [PATCH 174/202] Use nextest for rust unittests (#6223)

## Problem
`cargo test` doesn't support timeouts
or junit output format

## Summary of changes
- Add `nextest` to `build-tools` image
- Switch `cargo test` with `cargo nextest` on CI
- Set timeout
---
 .config/nextest.toml                 | 2 ++
 .github/workflows/build_and_test.yml | 8 ++++----
 Dockerfile.buildtools                | 1 +
 3 files changed, 7 insertions(+), 4 deletions(-)
 create mode 100644 .config/nextest.toml

diff --git a/.config/nextest.toml b/.config/nextest.toml
new file mode 100644
index 0000000000..8bccd51c6d
--- /dev/null
+++ b/.config/nextest.toml
@@ -0,0 +1,2 @@
+[profile.default]
+slow-timeout = "1m"
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 3091ce6d3a..78deff6e85 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -339,16 +339,16 @@ jobs:
         run: |
           ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
-      - name: Run cargo test
+      - name: Run rust tests
         run: |
-          ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
           export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage) & binary(test_real_s3)'
 
           # Run separate tests for real Azure Blob Storage
           # XXX: replace region with `eu-central-1`-like region
@@ -358,7 +358,7 @@ jobs:
           export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
           export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage) & binary(test_real_azure)'
 
       - name: Install rust binaries
         run: |
diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools
index 77722f173b..c2fcd8841e 100644
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -151,6 +151,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     cargo install cargo-hakari && \
     cargo install cargo-deny && \
     cargo install cargo-hack && \
+    cargo install cargo-nextest && \
     rm -rf /home/nonroot/.cargo/registry && \
     rm -rf /home/nonroot/.cargo/git
 ENV RUSTC_WRAPPER=cachepot

From 9a43c04a19c8577466dd6e992991a2aaeec6e556 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 1 Jan 2024 14:38:08 +0300
Subject: [PATCH 175/202] compute_ctl: kill postgres and sync-safekeepers on
 exit.

Otherwise they are left orphaned when compute_ctl is terminated with a
signal. It was invisible most of the time because normally neon_local or k8s
kills postgres directly and then compute_ctl finishes gracefully. However, in
some tests compute_ctl gets stuck waiting for sync-safekeepers which
intentionally never ends because safekeepers are offline, and we want to stop
compute_ctl without leaving orphans behind.

This is a quite rough approach which doesn't wait for children termination. A
better way would be to convert compute_ctl to async which would make waiting
easy.
---
 Cargo.lock                           |  2 ++
 compute_tools/Cargo.toml             |  2 ++
 compute_tools/src/bin/compute_ctl.rs | 32 +++++++++++++++++++++++++++-
 compute_tools/src/compute.rs         |  8 +++++++
 control_plane/src/endpoint.rs        | 18 ++++++++++++----
 5 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index abd87dc0da..8e0ad7c8ee 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1161,6 +1161,7 @@ dependencies = [
  "flate2",
  "futures",
  "hyper",
+ "nix 0.26.2",
  "notify",
  "num_cpus",
  "opentelemetry",
@@ -1171,6 +1172,7 @@ dependencies = [
  "rust-ini",
  "serde",
  "serde_json",
+ "signal-hook",
  "tar",
  "tokio",
  "tokio-postgres",
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 142fa08495..759a117ee9 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -13,6 +13,7 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
+nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
@@ -20,6 +21,7 @@ postgres.workspace = true
 regex.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+signal-hook.workspace = true
 tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 436db59088..eb1d746f04 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -40,18 +40,22 @@ use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
+use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};
 
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
+use nix::sys::signal::{kill, Signal};
+use signal_hook::consts::{SIGQUIT, SIGTERM};
+use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
 use url::Url;
 
 use compute_api::responses::ComputeStatus;
 
-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
+use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -67,6 +71,13 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
     init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
 
+    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
+    thread::spawn(move || {
+        for sig in signals.forever() {
+            handle_exit_signal(sig);
+        }
+    });
+
     let build_tag = option_env!("BUILD_TAG")
         .unwrap_or(BUILD_TAG_DEFAULT)
         .to_string();
@@ -346,6 +357,7 @@ fn main() -> Result<()> {
         let ecode = pg
             .wait()
             .expect("failed to start waiting on Postgres process");
+        PG_PID.store(0, Ordering::SeqCst);
         info!("Postgres exited with code {}, shutting down", ecode);
         exit_code = ecode.code()
     }
@@ -519,6 +531,24 @@ fn cli() -> clap::Command {
         )
 }
 
+/// When compute_ctl is killed, also send a termination signal to
+/// sync-safekeepers and postgres to prevent leaking them. TODO: it is better to
+/// convert compute_ctl to async and wait for termination, which would be easy then.
+fn handle_exit_signal(sig: i32) {
+    info!("received {sig} termination signal");
+    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
+    if ss_pid != 0 {
+        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
+        kill(ss_pid, Signal::SIGTERM).ok();
+    }
+    let pg_pid = PG_PID.load(Ordering::SeqCst);
+    if pg_pid != 0 {
+        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
+        kill(pg_pid, Signal::SIGTERM).ok();
+    }
+    exit(1);
+}
+
 #[test]
 fn verify_cli() {
     cli().debug_assert()
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index cd7be0520e..13701b7378 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -6,6 +6,8 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
+use std::sync::atomic::AtomicU32;
+use std::sync::atomic::Ordering;
 use std::sync::{Condvar, Mutex, RwLock};
 use std::thread;
 use std::time::Instant;
@@ -34,6 +36,9 @@ use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};
 
+pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
+pub static PG_PID: AtomicU32 = AtomicU32::new(0);
+
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
     // Url type maintains proper escaping
@@ -501,6 +506,7 @@ impl ComputeNode {
             .stdout(Stdio::piped())
             .spawn()
             .expect("postgres --sync-safekeepers failed to start");
+        SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
 
         // `postgres --sync-safekeepers` will print all log output to stderr and
         // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
@@ -508,6 +514,7 @@ impl ComputeNode {
         let sync_output = sync_handle
             .wait_with_output()
             .expect("postgres --sync-safekeepers failed");
+        SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
 
         if !sync_output.status.success() {
             anyhow::bail!(
@@ -662,6 +669,7 @@ impl ComputeNode {
             })
             .spawn()
             .expect("cannot start postgres process");
+        PG_PID.store(pg.id(), Ordering::SeqCst);
 
         wait_for_postgres(&mut pg, pgdata_path)?;
 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 55b66742ca..3d5dfd6311 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,6 +46,8 @@ use std::time::Duration;
 
 use anyhow::{anyhow, bail, Context, Result};
 use compute_api::spec::RemoteExtSpec;
+use nix::sys::signal::kill;
+use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -439,11 +441,14 @@ impl Endpoint {
         Ok(())
     }
 
-    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
+    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
         // TODO use background_process::stop_process instead
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
         let pid = nix::unistd::Pid::from_raw(pid as i32);
+        if send_sigterm {
+            kill(pid, Signal::SIGTERM).ok();
+        }
         crate::background_process::wait_until_stopped("compute_ctl", pid)?;
         Ok(())
     }
@@ -733,10 +738,15 @@ impl Endpoint {
             &None,
         )?;
 
-        // Also wait for the compute_ctl process to die. It might have some cleanup
-        // work to do after postgres stops, like syncing safekeepers, etc.
+        // Also wait for the compute_ctl process to die. It might have some
+        // cleanup work to do after postgres stops, like syncing safekeepers,
+        // etc.
         //
-        self.wait_for_compute_ctl_to_exit()?;
+        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
+        // want this cleanup: tests intentionally do stop when majority of
+        // safekeepers is down, so sync-safekeepers would hang otherwise. This
+        // could be a separate flag though.
+        self.wait_for_compute_ctl_to_exit(destroy)?;
         if destroy {
             println!(
                 "Destroying postgres data directory '{}'",

From 90ef48aab8d66e1fc3ad4a8cb187f13b8426bbb0 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 1 Jan 2024 14:43:44 +0300
Subject: [PATCH 176/202] Fix safekeeper START_REPLICATION (term=n).

It was giving WAL only up to commit_lsn instead of flush_lsn, so recovery of
uncommitted WAL has hung since cdb08f03. Add a test for this.
---
 safekeeper/src/send_wal.rs                    | 11 +----
 .../regress/test_wal_acceptor_async.py        | 40 +++++++++++++++++++
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 44f14f8c7e..70590a0f95 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -391,15 +391,8 @@ impl SafekeeperPostgresHandler {
         // application_name: give only committed WAL (used by pageserver) or all
         // existing WAL (up to flush_lsn, used by walproposer or peer recovery).
         // The second case is always driven by a consensus leader which term
-        // must generally be also supplied. However we're sloppy to do this in
-        // walproposer recovery which will be removed soon. So TODO is to make
-        // it not Option'al then.
-        //
-        // Fetching WAL without term in recovery creates a small risk of this
-        // WAL getting concurrently garbaged if another compute rises which
-        // collects majority and starts fixing log on this safekeeper itself.
-        // That's ok as (old) proposer will never be able to commit such WAL.
-        let end_watch = if self.is_walproposer_recovery() {
+        // must be supplied.
+        let end_watch = if term.is_some() {
             EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
         } else {
             EndWatch::Commit(tli.get_commit_lsn_watch_rx())
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index feab7e605b..77d67cd63a 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -475,6 +475,46 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder):
     asyncio.run(run_unavailability(env, endpoint))
 
 
+async def run_recovery_uncommitted(env: NeonEnv):
+    (sk1, sk2, _) = env.safekeepers
+
+    env.neon_cli.create_branch("test_recovery_uncommitted")
+    ep = env.endpoints.create_start("test_recovery_uncommitted")
+    ep.safe_psql("create table t(key int, value text)")
+    ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
+
+    # insert with only one safekeeper up to create tail of flushed but not committed WAL
+    sk1.stop()
+    sk2.stop()
+    conn = await ep.connect_async()
+    # query should hang, so execute in separate task
+    bg_query = asyncio.create_task(
+        conn.execute("insert into t select generate_series(1, 2000), 'payload'")
+    )
+    sleep_sec = 2
+    await asyncio.sleep(sleep_sec)
+    # it must still be not finished
+    assert not bg_query.done()
+    # note: destroy will kill compute_ctl, preventing it from waiting for a hanging sync-safekeepers.
+    ep.stop_and_destroy()
+
+    # Start one of sks to make quorum online plus compute and ensure they can
+    # sync.
+    sk2.start()
+    ep = env.endpoints.create_start(
+        "test_recovery_uncommitted",
+    )
+    ep.safe_psql("insert into t select generate_series(1, 2000), 'payload'")
+
+
+# Test pulling uncommitted WAL (up to flush_lsn) during recovery.
+def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    asyncio.run(run_recovery_uncommitted(env))
+
+
 @dataclass
 class RaceConditionTest:
     iteration: int

From dbd36e40dcc60fc9ced780c4ca0161a9c85fdc06 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 1 Jan 2024 22:33:27 +0300
Subject: [PATCH 177/202] Move failpoint support code to utils.

To enable them in safekeeper as well.
---
 Cargo.lock                                    |  1 +
 libs/pageserver_api/src/models.rs             | 13 -----
 libs/utils/Cargo.toml                         |  7 +++
 .../utils}/src/failpoint_support.rs           | 57 ++++++++++++++++++-
 libs/utils/src/lib.rs                         |  2 +
 pageserver/src/bin/pageserver.rs              |  3 +-
 pageserver/src/http/routes.rs                 | 32 +----------
 pageserver/src/lib.rs                         |  2 -
 pageserver/src/tenant.rs                      |  9 ++-
 pageserver/src/walingest.rs                   |  5 +-
 10 files changed, 74 insertions(+), 57 deletions(-)
 rename {pageserver => libs/utils}/src/failpoint_support.rs (61%)

diff --git a/Cargo.lock b/Cargo.lock
index 8e0ad7c8ee..73cb83d3a7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5880,6 +5880,7 @@ dependencies = [
  "chrono",
  "const_format",
  "criterion",
+ "fail",
  "futures",
  "heapless",
  "hex",
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index be41b610b8..dea925b468 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -557,19 +557,6 @@ pub enum DownloadRemoteLayersTaskState {
     ShutDown,
 }
 
-pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
-
-/// Information for configuring a single fail point
-#[derive(Debug, Serialize, Deserialize)]
-pub struct FailpointConfig {
-    /// Name of the fail point
-    pub name: String,
-    /// List of actions to take, using the format described in `fail::cfg`
-    ///
-    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
-    pub actions: String,
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 pub struct TimelineGcRequest {
     pub gc_horizon: Option<u64>,
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index af0414daa2..706b7a3187 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,6 +4,12 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+default = []
+# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
+# which adds some runtime cost to run tests on outage conditions
+testing = ["fail/failpoints"]
+
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
@@ -16,6 +22,7 @@ chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
+fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
 nix.workspace = true
diff --git a/pageserver/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs
similarity index 61%
rename from pageserver/src/failpoint_support.rs
rename to libs/utils/src/failpoint_support.rs
index 2190eba18a..5ec532e2a6 100644
--- a/pageserver/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -1,3 +1,14 @@
+//! Failpoint support code shared between pageserver and safekeepers.
+
+use crate::http::{
+    error::ApiError,
+    json::{json_request, json_response},
+};
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+use tracing::*;
+
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
@@ -25,7 +36,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async;
 // Helper function used by the macro. (A function has nicer scoping so we
 // don't need to decorate everything with "::")
 #[doc(hidden)]
-pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
     let millis = duration_str.parse::<u64>().unwrap();
     let d = std::time::Duration::from_millis(millis);
 
@@ -71,7 +82,7 @@ pub fn init() -> fail::FailScenario<'static> {
     scenario
 }
 
-pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
     if actions == "exit" {
         fail::cfg_callback(name, exit_failpoint)
     } else {
@@ -84,3 +95,45 @@ fn exit_failpoint() {
     tracing::info!("Exit requested by failpoint");
     std::process::exit(1);
 }
+
+pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
+
+/// Information for configuring a single fail point
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FailpointConfig {
+    /// Name of the fail point
+    pub name: String,
+    /// List of actions to take, using the format described in `fail::cfg`
+    ///
+    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
+    pub actions: String,
+}
+
+/// Configure failpoints through http.
+pub async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    if !fail::has_failpoints() {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "Cannot manage failpoints because storage was compiled without failpoints support"
+        )));
+    }
+
+    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
+    for fp in failpoints {
+        info!("cfg failpoint: {} {}", fp.name, fp.actions);
+
+        // We recognize one extra "action" that's not natively recognized
+        // by the failpoints crate: exit, to immediately kill the process
+        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
+
+        if let Err(err_msg) = cfg_result {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "Failed to configure failpoints: {err_msg}"
+            )));
+        }
+    }
+
+    json_response(StatusCode::OK, ())
+}
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index bb6c848bf4..9e9b0adfe5 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -83,6 +83,8 @@ pub mod timeout;
 
 pub mod sync;
 
+pub mod failpoint_support;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index f65c4f4580..621ad050f4 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -31,6 +31,7 @@ use pageserver::{
     virtual_file,
 };
 use postgres_backend::AuthType;
+use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
@@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> {
     }
 
     // Initialize up failpoints support
-    let scenario = pageserver::failpoint_support::init();
+    let scenario = failpoint_support::init();
 
     // Basic initialization of things that don't change after startup
     virtual_file::init(conf.max_file_descriptors);
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 11a3a2c872..157e6b4e3e 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -25,6 +25,7 @@ use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
+use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -66,9 +67,6 @@ use utils::{
     lsn::Lsn,
 };
 
-// Imports only used for testing APIs
-use pageserver_api::models::ConfigureFailpointsRequest;
-
 // For APIs that require an Active tenant, how long should we block waiting for that state?
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
@@ -1293,34 +1291,6 @@ async fn handle_tenant_break(
     json_response(StatusCode::OK, ())
 }
 
-async fn failpoints_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    if !fail::has_failpoints() {
-        return Err(ApiError::BadRequest(anyhow!(
-            "Cannot manage failpoints because pageserver was compiled without failpoints support"
-        )));
-    }
-
-    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
-    for fp in failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(err_msg) = cfg_result {
-            return Err(ApiError::BadRequest(anyhow!(
-                "Failed to configure failpoints: {err_msg}"
-            )));
-        }
-    }
-
-    json_response(StatusCode::OK, ())
-}
-
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
     mut request: Request<Body>,
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 58adf6e8c4..c1ce0af47b 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -25,8 +25,6 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;
 
-pub mod failpoint_support;
-
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2f2169d194..e50987c84b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -33,6 +33,7 @@ use tracing::*;
 use utils::backoff;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
+use utils::failpoint_support;
 use utils::fs_ext;
 use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
@@ -890,7 +891,7 @@ impl Tenant {
     ) -> anyhow::Result<()> {
         span::debug_assert_current_span_has_tenant_id();
 
-        crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");
+        failpoint_support::sleep_millis_async!("before-attaching-tenant");
 
         let preload = match preload {
             Some(p) => p,
@@ -1002,7 +1003,7 @@ impl Tenant {
         // IndexPart is the source of truth.
         self.clean_up_timelines(&existent_timelines)?;
 
-        crate::failpoint_support::sleep_millis_async!("attach-before-activate");
+        failpoint_support::sleep_millis_async!("attach-before-activate");
 
         info!("Done");
 
@@ -2839,9 +2840,7 @@ impl Tenant {
             }
         };
 
-        crate::failpoint_support::sleep_millis_async!(
-            "gc_iteration_internal_after_getting_gc_timelines"
-        );
+        failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
 
         // If there is nothing to GC, we don't want any messages in the INFO log.
         if !gc_timelines.is_empty() {
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 1d14214030..a6a8972970 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -29,6 +29,7 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
+use utils::failpoint_support;
 
 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
@@ -344,9 +345,7 @@ impl<'a> WalIngest<'a> {
                         // particular point in the WAL. For more fine-grained control,
                         // we could peek into the message and only pause if it contains
                         // a particular string, for example, but this is enough for now.
-                        crate::failpoint_support::sleep_millis_async!(
-                            "wal-ingest-logical-message-sleep"
-                        );
+                        failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
                     } else if let Some(path) = prefix.strip_prefix("neon-file:") {
                         modification.put_file(path, message, ctx).await?;
                     }

From e79a19339c2a6b8bb089a64db5e346e8f19a75d2 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 1 Jan 2024 23:32:24 +0300
Subject: [PATCH 178/202] Add failpoint support to safekeeper.

Just a copy paste from pageserver.
---
 Cargo.lock                            |  1 +
 safekeeper/Cargo.toml                 |  7 ++++++
 safekeeper/src/bin/safekeeper.rs      | 17 ++++++++++++-
 safekeeper/src/http/routes.rs         |  8 ++++++
 test_runner/fixtures/neon_fixtures.py | 36 +++++++++++++++++++++++----
 5 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 73cb83d3a7..55e868a6d5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4449,6 +4449,7 @@ dependencies = [
  "clap",
  "const_format",
  "crc32c",
+ "fail",
  "fs2",
  "futures",
  "git-version",
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index cccb4ebd79..4015c27933 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -4,6 +4,12 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+default = []
+# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
+# which adds some runtime cost to run tests on outage conditions
+testing = ["fail/failpoints"]
+
 [dependencies]
 async-stream.workspace = true
 anyhow.workspace = true
@@ -16,6 +22,7 @@ chrono.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 crc32c.workspace = true
+fail.workspace = true
 fs2.workspace = true
 git-version.workspace = true
 hex.workspace = true
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index e59deb9fda..33047051df 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -54,6 +54,19 @@ const ID_FILE_NAME: &str = "safekeeper.id";
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
 
+const FEATURES: &[&str] = &[
+    #[cfg(feature = "testing")]
+    "testing",
+];
+
+fn version() -> String {
+    format!(
+        "{GIT_VERSION} failpoints: {}, features: {:?}",
+        fail::has_failpoints(),
+        FEATURES,
+    )
+}
+
 const ABOUT: &str = r#"
 A fleet of safekeepers is responsible for reliably storing WAL received from
 compute, passing it through consensus (mitigating potential computes brain
@@ -167,7 +180,9 @@ async fn main() -> anyhow::Result<()> {
     // getting 'argument cannot be used multiple times' error. This seems to be
     // impossible with pure Derive API, so convert struct to Command, modify it,
     // parse arguments, and then fill the struct back.
-    let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
+    let cmd = <Args as clap::CommandFactory>::command()
+        .args_override_self(true)
+        .version(version());
     let mut matches = cmd.get_matches();
     let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
 
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index c48b5330b3..25a3334e63 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -12,6 +12,8 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use tokio::fs::File;
 use tokio::io::AsyncReadExt;
+use tokio_util::sync::CancellationToken;
+use utils::failpoint_support::failpoints_handler;
 
 use std::io::Write as _;
 use tokio::sync::mpsc;
@@ -444,6 +446,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .data(Arc::new(conf))
         .data(auth)
         .get("/v1/status", |r| request_span(r, status_handler))
+        .put("/v1/failpoints", |r| {
+            request_span(r, move |r| async {
+                let cancel = CancellationToken::new();
+                failpoints_handler(r, cancel).await
+            })
+        })
         // Will be used in the future instead of implicit timeline creation
         .post("/v1/tenant/timeline", |r| {
             request_span(r, timeline_create_handler)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 597e311e02..9aa82d8854 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -890,8 +890,8 @@ class NeonEnv:
         """Get list of safekeeper endpoints suitable for safekeepers GUC"""
         return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
 
-    def get_pageserver_version(self) -> str:
-        bin_pageserver = str(self.neon_binpath / "pageserver")
+    def get_binary_version(self, binary_name: str) -> str:
+        bin_pageserver = str(self.neon_binpath / binary_name)
         res = subprocess.run(
             [bin_pageserver, "--version"],
             check=True,
@@ -1656,7 +1656,7 @@ class NeonPageserver(PgProtocol):
         self.running = False
         self.service_port = port
         self.config_override = config_override
-        self.version = env.get_pageserver_version()
+        self.version = env.get_binary_version("pageserver")
 
         # After a test finishes, we will scrape the log to see if there are any
         # unexpected error messages. If your test expects an error, add it to
@@ -2924,7 +2924,8 @@ class Safekeeper:
                 return res
 
     def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient:
-        return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token)
+        is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper")
+        return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled)
 
     def data_dir(self) -> str:
         return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
@@ -2975,10 +2976,11 @@ class SafekeeperMetrics:
 class SafekeeperHttpClient(requests.Session):
     HTTPError = requests.HTTPError
 
-    def __init__(self, port: int, auth_token: Optional[str] = None):
+    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled = False):
         super().__init__()
         self.port = port
         self.auth_token = auth_token
+        self.is_testing_enabled = is_testing_enabled
 
         if auth_token is not None:
             self.headers["Authorization"] = f"Bearer {auth_token}"
@@ -2986,6 +2988,30 @@ class SafekeeperHttpClient(requests.Session):
     def check_status(self):
         self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
 
+    def is_testing_enabled_or_skip(self):
+        if not self.is_testing_enabled:
+            pytest.skip("safekeeper was built without 'testing' feature")
+
+    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
+        self.is_testing_enabled_or_skip()
+
+        if isinstance(config_strings, tuple):
+            pairs = [config_strings]
+        else:
+            pairs = config_strings
+
+        log.info(f"Requesting config failpoints: {repr(pairs)}")
+
+        res = self.put(
+            f"http://localhost:{self.port}/v1/failpoints",
+            json=[{"name": name, "actions": actions} for name, actions in pairs],
+        )
+        log.info(f"Got failpoints request response code {res.status_code}")
+        res.raise_for_status()
+        res_json = res.json()
+        assert res_json is None
+        return res_json
+
     def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         params = params or {}
         res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)

From aaaa39d9f52a46641c86314ddc9d15565275d9c2 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 29 Dec 2023 23:09:36 +0300
Subject: [PATCH 179/202] Add large insertion and slow WAL sending to
 test_hot_standby.

To exercise MAX_SEND_SIZE sending from safekeeper; we've had a bug with WAL
records torn across several XLogData messages. Add a failpoint to safekeeper to
slow down sending. Also check for corrupted WAL complaints in the standby log.

Make the test a bit simpler in passing, e.g. we don't need explicit commits as
autocommit is enabled by default.

https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719
https://github.com/neondatabase/cloud/issues/9057
---
 safekeeper/src/send_wal.rs              |  6 ++
 test_runner/fixtures/neon_fixtures.py   | 17 +++--
 test_runner/regress/test_hot_standby.py | 91 +++++++++++++++----------
 3 files changed, 73 insertions(+), 41 deletions(-)

diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 70590a0f95..bd1d306968 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -17,6 +17,7 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
 use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
+use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::lsn::AtomicLsn;
 use utils::pageserver_feedback::PageserverFeedback;
@@ -559,6 +560,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                 }))
                 .await?;
 
+            if let Some(appname) = &self.appname {
+                if appname == "replica" {
+                    failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep");
+                }
+            }
             trace!(
                 "sent {} bytes of WAL {}-{}",
                 send_size,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9aa82d8854..5b1a8ba27d 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -347,7 +347,9 @@ class PgProtocol:
         """
         return self.safe_psql_many([query], **kwargs)[0]
 
-    def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
+    def safe_psql_many(
+        self, queries: List[str], log_query=True, **kwargs: Any
+    ) -> List[List[Tuple[Any, ...]]]:
         """
         Execute queries against the node and return all rows.
         This method passes all extra params to connstr.
@@ -356,7 +358,8 @@ class PgProtocol:
         with closing(self.connect(**kwargs)) as conn:
             with conn.cursor() as cur:
                 for query in queries:
-                    log.info(f"Executing query: {query}")
+                    if log_query:
+                        log.info(f"Executing query: {query}")
                     cur.execute(query)
 
                     if cur.description is None:
@@ -365,11 +368,11 @@ class PgProtocol:
                         result.append(cur.fetchall())
         return result
 
-    def safe_psql_scalar(self, query) -> Any:
+    def safe_psql_scalar(self, query, log_query=True) -> Any:
         """
         Execute query returning single row with single column.
         """
-        return self.safe_psql(query)[0][0]
+        return self.safe_psql(query, log_query=log_query)[0][0]
 
 
 @dataclass
@@ -2925,7 +2928,9 @@ class Safekeeper:
 
     def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient:
         is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper")
-        return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled)
+        return SafekeeperHttpClient(
+            port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled
+        )
 
     def data_dir(self) -> str:
         return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
@@ -2976,7 +2981,7 @@ class SafekeeperMetrics:
 class SafekeeperHttpClient(requests.Session):
     HTTPError = requests.HTTPError
 
-    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled = False):
+    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
         super().__init__()
         self.port = port
         self.auth_token = auth_token
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 031fd2857d..7822e29ed9 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,19 +1,59 @@
+import os
+import re
 import time
 
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv
+
+
+def wait_caughtup(primary: Endpoint, secondary: Endpoint):
+    primary_lsn = primary.safe_psql_scalar(
+        "SELECT pg_current_wal_insert_lsn()::text", log_query=False
+    )
+    while True:
+        secondary_lsn = secondary.safe_psql_scalar(
+            "SELECT pg_last_wal_replay_lsn()", log_query=False
+        )
+        caught_up = secondary_lsn >= primary_lsn
+        log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}")
+        if caught_up:
+            return
+        time.sleep(1)
+
+
+# Check for corrupted WAL messages which might otherwise go unnoticed if
+# reconnection fixes this.
+def scan_standby_log_for_errors(secondary):
+    log_path = secondary.endpoint_path() / "compute.log"
+    with log_path.open("r") as f:
+        markers = re.compile(
+            r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr"
+        )
+        for line in f:
+            if markers.search(line):
+                log.info(f"bad error in standby log: {line}")
+                raise AssertionError()
 
 
 def test_hot_standby(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
+    # We've had a bug caused by WAL records split across multiple XLogData
+    # messages resulting in corrupted WAL complains on standby. It reproduced
+    # only when sending from safekeeper is slow enough to grab full
+    # MAX_SEND_SIZE messages. So insert sleep through failpoints, but only in
+    # one conf to decrease test time.
+    slow_down_send = "[debug-pg16]" in os.environ.get("PYTEST_CURRENT_TEST", "")
+    if slow_down_send:
+        sk_http = env.safekeepers[0].http_client()
+        sk_http.configure_failpoints([("sk-send-wal-replica-sleep", "return(100)")])
+
     with env.endpoints.create_start(
         branch_name="main",
         endpoint_id="primary",
     ) as primary:
         time.sleep(1)
         with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
-            primary_lsn = None
-            caught_up = False
             queries = [
                 "SHOW neon.timeline_id",
                 "SHOW neon.tenant_id",
@@ -26,23 +66,6 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                 with p_con.cursor() as p_cur:
                     p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i")
 
-                # Explicit commit to make sure other connections (and replicas) can
-                # see the changes of this commit.
-                p_con.commit()
-
-                with p_con.cursor() as p_cur:
-                    p_cur.execute("SELECT pg_current_wal_insert_lsn()::text")
-                    res = p_cur.fetchone()
-                    assert res is not None
-                    (lsn,) = res
-                    primary_lsn = lsn
-
-                # Explicit commit to make sure other connections (and replicas) can
-                # see the changes of this commit.
-                # Note that this may generate more WAL if the transaction has changed
-                # things, but we don't care about that.
-                p_con.commit()
-
                 for query in queries:
                     with p_con.cursor() as p_cur:
                         p_cur.execute(query)
@@ -51,30 +74,28 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                         response = res
                         responses[query] = response
 
+                # insert more data to make safekeeper send MAX_SEND_SIZE messages
+                if slow_down_send:
+                    primary.safe_psql("create table t(key int, value text)")
+                    primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'")
+
+            wait_caughtup(primary, secondary)
+
             with secondary.connect() as s_con:
                 with s_con.cursor() as s_cur:
                     s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()")
                     res = s_cur.fetchone()
                     assert res is not None
 
-                while not caught_up:
-                    with s_con.cursor() as secondary_cursor:
-                        secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()")
-                        res = secondary_cursor.fetchone()
-                        assert res is not None
-                        (secondary_lsn,) = res
-                        # There may be more changes on the primary after we got our LSN
-                        # due to e.g. autovacuum, but that shouldn't impact the content
-                        # of the tables, so we check whether we've replayed up to at
-                        # least after the commit of the `test` table.
-                        caught_up = secondary_lsn >= primary_lsn
-
-                # Explicit commit to flush any transient transaction-level state.
-                s_con.commit()
-
                 for query in queries:
                     with s_con.cursor() as secondary_cursor:
                         secondary_cursor.execute(query)
                         response = secondary_cursor.fetchone()
                         assert response is not None
                         assert response == responses[query]
+
+            scan_standby_log_for_errors(secondary)
+
+    # clean up
+    if slow_down_send:
+        sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off"))

From e92c9f42c05a6c10b53269999c4555d9c8a8f6c9 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Sat, 30 Dec 2023 00:31:19 +0300
Subject: [PATCH 180/202] Don't split WAL record across two XLogData's when
 sending from safekeepers.

This is what the protocol demands. Not following it makes the standby
complain about corrupted WAL in various ways.

https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719
closes https://github.com/neondatabase/cloud/issues/9057
---
 safekeeper/src/send_wal.rs    | 22 +++++++++++++++-------
 safekeeper/src/wal_storage.rs |  3 +++
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index bd1d306968..9a5657a40d 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -529,12 +529,19 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
             );
 
             // try to send as much as available, capped by MAX_SEND_SIZE
-            let mut send_size = self
-                .end_pos
-                .checked_sub(self.start_pos)
-                .context("reading wal without waiting for it first")?
-                .0 as usize;
-            send_size = min(send_size, self.send_buf.len());
+            let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64;
+            // if we went behind available WAL, back off
+            if chunk_end_pos >= self.end_pos {
+                chunk_end_pos = self.end_pos;
+            } else {
+                // If sending not up to end pos, round down to page boundary to
+                // avoid breaking WAL record not at page boundary, as protocol
+                // demands. See walsender.c (XLogSendPhysical).
+                chunk_end_pos = chunk_end_pos
+                    .checked_sub(chunk_end_pos.block_offset())
+                    .unwrap();
+            }
+            let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize;
             let send_buf = &mut self.send_buf[..send_size];
             let send_size: usize;
             {
@@ -545,7 +552,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                 } else {
                     None
                 };
-                // read wal into buffer
+                // Read WAL into buffer. send_size can be additionally capped to
+                // segment boundary here.
                 send_size = self.wal_reader.read(send_buf).await?
             };
             let send_buf = &send_buf[..send_size];
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index fa44b24258..e7538f805c 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -565,6 +565,9 @@ impl WalReader {
         })
     }
 
+    /// Read WAL at current position into provided buf, returns number of bytes
+    /// read. It can be smaller than buf size only if segment boundary is
+    /// reached.
     pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
         // If this timeline is new, we may not have a full segment yet, so
         // we pad the first bytes of the timeline's first WAL segment with 0s

From ea9fad419eceb7af4d340e308b25727c39eeb622 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 5 Dec 2023 15:04:26 -0800
Subject: [PATCH 181/202] Add exponential backoff to page_server->send

---
 pgxn/neon/pagestore_smgr.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 8888cd89c6..6cf2762179 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -275,6 +275,26 @@ static inline void prefetch_set_unused(uint64 ring_index);
 static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
 									   ForkNumber forknum, BlockNumber blkno);
 
+
+#define INITIAL_EXPONENTIAL_BACKOFF_DELAY 1000
+#define EXPONENTIAL_BACKOFF_EXPONENT 2
+#define MAX_EXPONENTIAL_BACKOFF_DELAY (1000*1000)
+
+static void
+InitExponentialBackoff(long *delay)
+{
+	*delay = INITIAL_EXPONENTIAL_BACKOFF_DELAY;
+}
+
+static void
+PerformExponentialBackoff(long *delay)
+{
+	pg_usleep(*delay);
+	*delay *= EXPONENTIAL_BACKOFF_EXPONENT;
+	if(*delay >= MAX_EXPONENTIAL_BACKOFF_DELAY)
+		*delay = MAX_EXPONENTIAL_BACKOFF_DELAY;
+}
+
 static bool
 compact_prefetch_buffers(void)
 {
@@ -662,6 +682,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 		.forknum = slot->buftag.forkNum,
 		.blkno = slot->buftag.blockNum,
 	};
+        long backoff_delay_us;
 
 	if (force_lsn && force_latest)
 	{
@@ -704,7 +725,11 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
 
-	while (!page_server->send((NeonRequest *) &request));
+	InitExponentialBackoff(&backoff_delay_us);
+	while (!page_server->send((NeonRequest *) &request))
+	{
+		PerformExponentialBackoff(&backoff_delay_us);
+	}
 
 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;

From 091a0cda9d2b309f06189b4d976bb91fd3de7dc1 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Fri, 15 Dec 2023 12:35:38 -0800
Subject: [PATCH 182/202] Switch to rate-limiting strategy

---
 pgxn/neon/libpagestore.c   | 15 ++++++++++++++-
 pgxn/neon/pagestore_smgr.c | 27 +--------------------------
 2 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 3b038f906f..5db9e5e08e 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -133,6 +133,9 @@ pageserver_connect(int elevel)
 	const char *values[3];
 	int			n;
 
+	static TimestampTz last_connect_time = 0;
+	TimestampTz now;
+
 	Assert(!connected);
 
 	if (CheckConnstringUpdated())
@@ -140,6 +143,17 @@ pageserver_connect(int elevel)
 		ReloadConnstring();
 	}
 
+	now = GetCurrentTimestamp();
+	if ((now - last_connect_time) < RECONNECT_INTERVAL_USEC)
+	{
+		pg_usleep(RECONNECT_INTERVAL_USEC);
+		last_connect_time = GetCurrentTimestamp();
+	}
+	else
+	{
+		last_connect_time = now;
+	}
+
 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -333,7 +347,6 @@ pageserver_send(NeonRequest *request)
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
-			pg_usleep(RECONNECT_INTERVAL_USEC);
 		}
 		n_reconnect_attempts = 0;
 	}
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 6cf2762179..8888cd89c6 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -275,26 +275,6 @@ static inline void prefetch_set_unused(uint64 ring_index);
 static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
 									   ForkNumber forknum, BlockNumber blkno);
 
-
-#define INITIAL_EXPONENTIAL_BACKOFF_DELAY 1000
-#define EXPONENTIAL_BACKOFF_EXPONENT 2
-#define MAX_EXPONENTIAL_BACKOFF_DELAY (1000*1000)
-
-static void
-InitExponentialBackoff(long *delay)
-{
-	*delay = INITIAL_EXPONENTIAL_BACKOFF_DELAY;
-}
-
-static void
-PerformExponentialBackoff(long *delay)
-{
-	pg_usleep(*delay);
-	*delay *= EXPONENTIAL_BACKOFF_EXPONENT;
-	if(*delay >= MAX_EXPONENTIAL_BACKOFF_DELAY)
-		*delay = MAX_EXPONENTIAL_BACKOFF_DELAY;
-}
-
 static bool
 compact_prefetch_buffers(void)
 {
@@ -682,7 +662,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 		.forknum = slot->buftag.forkNum,
 		.blkno = slot->buftag.blockNum,
 	};
-        long backoff_delay_us;
 
 	if (force_lsn && force_latest)
 	{
@@ -725,11 +704,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
 
-	InitExponentialBackoff(&backoff_delay_us);
-	while (!page_server->send((NeonRequest *) &request))
-	{
-		PerformExponentialBackoff(&backoff_delay_us);
-	}
+	while (!page_server->send((NeonRequest *) &request));
 
 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;

From 4e1d16f3112f6296f5e4616a0fb67a60adde9dbd Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 26 Dec 2023 14:14:37 -0800
Subject: [PATCH 183/202] Switch to exponential rate-limiting

---
 pgxn/neon/libpagestore.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 5db9e5e08e..574e68af66 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -35,7 +35,8 @@
 
 #define PageStoreTrace DEBUG5
 
-#define RECONNECT_INTERVAL_USEC 1000000
+#define MAX_RECONNECT_INTERVAL_USEC 100
+#define MAX_RECONNECT_INTERVAL_USEC 1000000
 
 bool		connected = false;
 PGconn	   *pageserver_conn = NULL;
@@ -134,7 +135,9 @@ pageserver_connect(int elevel)
 	int			n;
 
 	static TimestampTz last_connect_time = 0;
+	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
 	TimestampTz now;
+        uint64_t us_since_last_connect;
 
 	Assert(!connected);
 
@@ -144,13 +147,18 @@ pageserver_connect(int elevel)
 	}
 
 	now = GetCurrentTimestamp();
-	if ((now - last_connect_time) < RECONNECT_INTERVAL_USEC)
+        us_since_last_connect = now - last_connect_time;
+	if (us_since_last_connect < delay_us)
 	{
-		pg_usleep(RECONNECT_INTERVAL_USEC);
+		pg_usleep(delay_us - us_since_last_connect);
+		delay_us *= 2;
+		if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
+			delay_us = MAX_RECONNECT_INTERVAL_USEC;
 		last_connect_time = GetCurrentTimestamp();
 	}
 	else
 	{
+		delay_us = MIN_RECONNECT_INTERVAL_USEC;
 		last_connect_time = now;
 	}
 

From ce13281d542be2d6fc50c7884fbe5f541cd5a64d Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Wed, 27 Dec 2023 08:52:46 -0800
Subject: [PATCH 184/202] MIN not MAX

---
 pgxn/neon/libpagestore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 574e68af66..e58c28d7d5 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -35,7 +35,7 @@
 
 #define PageStoreTrace DEBUG5
 
-#define MAX_RECONNECT_INTERVAL_USEC 100
+#define MIN_RECONNECT_INTERVAL_USEC 100
 #define MAX_RECONNECT_INTERVAL_USEC 1000000
 
 bool		connected = false;

From 946c6a000686ce8d790deaa74fafa6d1ed72e372 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 2 Jan 2024 17:22:16 +0200
Subject: [PATCH 185/202] scrubber: use adaptive config with retries, check
 subset of tenants (#6219)

The tool still needs a lot of work. This change adds the easiest fix and
the easiest feature:
- use an adaptive S3 client config with retries, similar to remote_storage
- process only particular tenants

Tenants need to be from the correct region, and they are not deduplicated,
but the feature is useful for re-checking a small number of tenants after
a large run.
---
 Cargo.lock                       |  2 ++
 s3_scrubber/Cargo.toml           |  3 ++
 s3_scrubber/src/lib.rs           |  8 ++++-
 s3_scrubber/src/main.rs          | 57 +++++++++++++++++---------------
 s3_scrubber/src/scan_metadata.rs | 11 ++++--
 5 files changed, 52 insertions(+), 29 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 55e868a6d5..93efbadd79 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4405,12 +4405,14 @@ dependencies = [
  "async-stream",
  "aws-config",
  "aws-sdk-s3",
+ "aws-smithy-async",
  "bincode",
  "bytes",
  "chrono",
  "clap",
  "crc32c",
  "either",
+ "futures",
  "futures-util",
  "hex",
  "histogram",
diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml
index fdae378d55..4d136472e0 100644
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true
 
 [dependencies]
 aws-sdk-s3.workspace = true
+aws-smithy-async.workspace = true
 either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
@@ -39,3 +40,5 @@ tracing-subscriber.workspace = true
 clap.workspace = true
 tracing-appender = "0.2"
 histogram = "0.7"
+
+futures.workspace = true
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index 8fb1346c8e..d2842877d0 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -16,10 +16,12 @@ use aws_config::environment::EnvironmentVariableCredentialsProvider;
 use aws_config::imds::credentials::ImdsCredentialsProvider;
 use aws_config::meta::credentials::CredentialsProviderChain;
 use aws_config::profile::ProfileFileCredentialsProvider;
+use aws_config::retry::RetryConfig;
 use aws_config::sso::SsoCredentialsProvider;
 use aws_config::BehaviorVersion;
-use aws_sdk_s3::config::Region;
+use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep};
 use aws_sdk_s3::{Client, Config};
+use aws_smithy_async::rt::sleep::TokioSleep;
 
 use clap::ValueEnum;
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
@@ -283,9 +285,13 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
         )
     };
 
+    let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
+
     let mut builder = Config::builder()
         .behavior_version(BehaviorVersion::v2023_11_09())
         .region(bucket_region)
+        .retry_config(RetryConfig::adaptive().with_max_attempts(3))
+        .sleep_impl(SharedAsyncSleep::from(sleep_impl))
         .credentials_provider(credentials_provider);
 
     if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") {
diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs
index ef020edc2a..957213856b 100644
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -1,3 +1,4 @@
+use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use s3_scrubber::scan_metadata::scan_metadata;
 use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
@@ -34,6 +35,8 @@ enum Command {
     ScanMetadata {
         #[arg(short, long, default_value_t = false)]
         json: bool,
+        #[arg(long = "tenant-id", num_args = 0..)]
+        tenant_ids: Vec<TenantShardId>,
     },
 }
 
@@ -57,35 +60,37 @@ async fn main() -> anyhow::Result<()> {
     ));
 
     match cli.command {
-        Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await {
-            Err(e) => {
-                tracing::error!("Failed: {e}");
-                Err(e)
-            }
-            Ok(summary) => {
-                if json {
-                    println!("{}", serde_json::to_string(&summary).unwrap())
-                } else {
-                    println!("{}", summary.summary_string());
+        Command::ScanMetadata { json, tenant_ids } => {
+            match scan_metadata(bucket_config.clone(), tenant_ids).await {
+                Err(e) => {
+                    tracing::error!("Failed: {e}");
+                    Err(e)
                 }
-                if summary.is_fatal() {
-                    Err(anyhow::anyhow!("Fatal scrub errors detected"))
-                } else if summary.is_empty() {
-                    // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                    // scrubber they were likely expecting to scan something, and if we see no timelines
-                    // at all then it's likely due to some configuration issues like a bad prefix
-                    Err(anyhow::anyhow!(
-                        "No timelines found in bucket {} prefix {}",
-                        bucket_config.bucket,
-                        bucket_config
-                            .prefix_in_bucket
-                            .unwrap_or("<none>".to_string())
-                    ))
-                } else {
-                    Ok(())
+                Ok(summary) => {
+                    if json {
+                        println!("{}", serde_json::to_string(&summary).unwrap())
+                    } else {
+                        println!("{}", summary.summary_string());
+                    }
+                    if summary.is_fatal() {
+                        Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                    } else if summary.is_empty() {
+                        // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                        // scrubber they were likely expecting to scan something, and if we see no timelines
+                        // at all then it's likely due to some configuration issues like a bad prefix
+                        Err(anyhow::anyhow!(
+                            "No timelines found in bucket {} prefix {}",
+                            bucket_config.bucket,
+                            bucket_config
+                                .prefix_in_bucket
+                                .unwrap_or("<none>".to_string())
+                        ))
+                    } else {
+                        Ok(())
+                    }
                 }
             }
-        },
+        }
         Command::FindGarbage {
             node_kind,
             depth,
diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs
index bcc4d2e618..bfde8f0213 100644
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -187,10 +187,17 @@ Timeline layer count: {6}
 }
 
 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
-pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<MetadataSummary> {
+pub async fn scan_metadata(
+    bucket_config: BucketConfig,
+    tenant_ids: Vec<TenantShardId>,
+) -> anyhow::Result<MetadataSummary> {
     let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?;
 
-    let tenants = stream_tenants(&s3_client, &target);
+    let tenants = if tenant_ids.is_empty() {
+        futures::future::Either::Left(stream_tenants(&s3_client, &target))
+    } else {
+        futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
+    };
 
     // How many tenants to process in parallel.  We need to be mindful of pageservers
     // accessing the same per tenant prefixes, so use a lower setting than pageservers.

From aa9f1d4b697eefb48e4ebff4aebb4881d2bb29c4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 2 Jan 2024 17:57:29 +0100
Subject: [PATCH 186/202] pagebench get-page: default to latest=true, make
 configurable via flag (#6252)

fixes https://github.com/neondatabase/neon/issues/6209
---
 pageserver/client/src/page_service.rs         |  9 +----
 .../pagebench/src/cmd/getpage_latest_lsn.rs   | 38 +++++++++++++------
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs
index fc0d2311f7..231461267a 100644
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -115,15 +115,8 @@ impl PagestreamClient {
 
     pub async fn getpage(
         &mut self,
-        key: RelTagBlockNo,
-        lsn: Lsn,
+        req: PagestreamGetPageRequest,
     ) -> anyhow::Result<PagestreamGetPageResponse> {
-        let req = PagestreamGetPageRequest {
-            latest: false,
-            rel: key.rel_tag,
-            blkno: key.block_no,
-            lsn,
-        };
         let req = PagestreamFeMessage::GetPage(req);
         let req: bytes::Bytes = req.serialize();
         // let mut req = tokio_util::io::ReaderStream::new(&req);
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 16d198ab0e..cb36a403f1 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -3,7 +3,7 @@ use futures::future::join_all;
 use pageserver::pgdatadir_mapping::key_to_rel_block;
 use pageserver::repository;
 use pageserver_api::key::is_rel_block_key;
-use pageserver_client::page_service::RelTagBlockNo;
+use pageserver_api::models::PagestreamGetPageRequest;
 
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
@@ -39,6 +39,9 @@ pub(crate) struct Args {
     runtime: Option<humantime::Duration>,
     #[clap(long)]
     per_target_rate_limit: Option<usize>,
+    /// Probability for sending `latest=true` in the request (uniform distribution).
+    #[clap(long, default_value = "1")]
+    req_latest_probability: f64,
     #[clap(long)]
     limit_to_first_n_targets: Option<usize>,
     targets: Option<Vec<TenantTimelineId>>,
@@ -200,18 +203,26 @@ async fn main_impl(
             start_work_barrier.wait().await;
 
             loop {
-                let (range, key) = {
+                let (timeline, req) = {
                     let mut rng = rand::thread_rng();
                     let r = &all_ranges[weights.sample(&mut rng)];
                     let key: i128 = rng.gen_range(r.start..r.end);
                     let key = repository::Key::from_i128(key);
                     let (rel_tag, block_no) =
                         key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                    (r, RelTagBlockNo { rel_tag, block_no })
+                    (
+                        r.timeline,
+                        PagestreamGetPageRequest {
+                            latest: rng.gen_bool(args.req_latest_probability),
+                            lsn: r.timeline_lsn,
+                            rel: rel_tag,
+                            blkno: block_no,
+                        },
+                    )
                 };
-                let sender = work_senders.get(&range.timeline).unwrap();
+                let sender = work_senders.get(&timeline).unwrap();
                 // TODO: what if this blocks?
-                sender.send((key, range.timeline_lsn)).await.ok().unwrap();
+                sender.send(req).await.ok().unwrap();
             }
         }),
         Some(rps_limit) => Box::pin(async move {
@@ -240,16 +251,21 @@ async fn main_impl(
                     );
                     loop {
                         ticker.tick().await;
-                        let (range, key) = {
+                        let req = {
                             let mut rng = rand::thread_rng();
                             let r = &ranges[weights.sample(&mut rng)];
                             let key: i128 = rng.gen_range(r.start..r.end);
                             let key = repository::Key::from_i128(key);
                             let (rel_tag, block_no) = key_to_rel_block(key)
                                 .expect("we filter non-rel-block keys out above");
-                            (r, RelTagBlockNo { rel_tag, block_no })
+                            PagestreamGetPageRequest {
+                                latest: rng.gen_bool(args.req_latest_probability),
+                                lsn: r.timeline_lsn,
+                                rel: rel_tag,
+                                blkno: block_no,
+                            }
                         };
-                        sender.send((key, range.timeline_lsn)).await.ok().unwrap();
+                        sender.send(req).await.ok().unwrap();
                     }
                 })
             };
@@ -303,7 +319,7 @@ async fn client(
     args: &'static Args,
     timeline: TenantTimelineId,
     start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
+    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
     all_work_done_barrier: Arc<Barrier>,
     live_stats: Arc<LiveStats>,
 ) {
@@ -317,10 +333,10 @@ async fn client(
         .await
         .unwrap();
 
-    while let Some((key, lsn)) = work.recv().await {
+    while let Some(req) = work.recv().await {
         let start = Instant::now();
         client
-            .getpage(key, lsn)
+            .getpage(req)
             .await
             .with_context(|| format!("getpage for {timeline}"))
             .unwrap();

From ae3eaf99957433b2df51aa79fb7b63f6959156f9 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 2 Jan 2024 20:27:53 +0300
Subject: [PATCH 187/202] Add [WP] prefix to all walproposer logging.

- rename walprop_log to wp_log
- also create wpg_log, which is used in postgres-specific code
- in passing, reformat messages to start with a lowercase letter
---
 libs/walproposer/src/walproposer.rs |   2 +-
 pgxn/neon/walproposer.c             | 240 ++++++++++++++--------------
 pgxn/neon/walproposer.h             |  16 +-
 pgxn/neon/walproposer_pg.c          |  68 ++++----
 4 files changed, 169 insertions(+), 157 deletions(-)

diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 35c8f6904d..7251545792 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -425,7 +425,7 @@ mod tests {
         }
 
         fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-            println!("walprop_log[{}] {}", level, msg);
+            println!("wp_log[{}] {}", level, msg);
         }
 
         fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 7fb0cab9a0..2ea724f927 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -99,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 		port = strchr(host, ':');
 		if (port == NULL)
 		{
-			walprop_log(FATAL, "port is not specified");
+			wp_log(FATAL, "port is not specified");
 		}
 		*port++ = '\0';
 		sep = strchr(port, ',');
@@ -107,7 +107,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 			*sep++ = '\0';
 		if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
 		{
-			walprop_log(FATAL, "Too many safekeepers");
+			wp_log(FATAL, "too many safekeepers");
 		}
 		wp->safekeeper[wp->n_safekeepers].host = host;
 		wp->safekeeper[wp->n_safekeepers].port = port;
@@ -123,7 +123,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 							   "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
 							   sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
 			if (written > MAXCONNINFO || written < 0)
-				walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
+				wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
 		}
 
 		initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
@@ -133,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	}
 	if (wp->n_safekeepers < 1)
 	{
-		walprop_log(FATAL, "Safekeepers addresses are not specified");
+		wp_log(FATAL, "safekeepers addresses are not specified");
 	}
 	wp->quorum = wp->n_safekeepers / 2 + 1;
 
@@ -144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
 	wp->greetRequest.systemId = wp->config->systemId;
 	if (!wp->config->neon_timeline)
-		walprop_log(FATAL, "neon.timeline_id is not provided");
+		wp_log(FATAL, "neon.timeline_id is not provided");
 	if (*wp->config->neon_timeline != '\0' &&
 		!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
-		walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
+		wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
 	if (!wp->config->neon_tenant)
-		walprop_log(FATAL, "neon.tenant_id is not provided");
+		wp_log(FATAL, "neon.tenant_id is not provided");
 	if (*wp->config->neon_tenant != '\0' &&
 		!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
-		walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);
+		wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
 
 	wp->greetRequest.timeline = wp->config->pgTimeline;
 	wp->greetRequest.walSegSize = wp->config->wal_segment_size;
@@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp)
 				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
 											   wp->config->safekeeper_connection_timeout))
 				{
-					walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
-								sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
+					wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
+						   sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -356,8 +356,8 @@ ResetConnection(Safekeeper *sk)
 		 *
 		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
 		 */
-		walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
-					sk->host, sk->port, wp->api.conn_error_message(sk));
+		wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s",
+			   sk->host, sk->port, wp->api.conn_error_message(sk));
 
 		/*
 		 * Even though the connection failed, we still need to clean up the
@@ -380,7 +380,7 @@ ResetConnection(Safekeeper *sk)
 	 * (see libpqrcv_connect, defined in
 	 * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
 	 */
-	walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
+	wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
 
 	sk->state = SS_CONNECTING_WRITE;
 	sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
@@ -434,7 +434,7 @@ ReconnectSafekeepers(WalProposer *wp)
 static void
 AdvancePollState(Safekeeper *sk, uint32 events)
 {
-#ifdef WALPROPOSER_LIB			/* walprop_log needs wp in lib build */
+#ifdef WALPROPOSER_LIB			/* wp_log needs wp in lib build */
 	WalProposer *wp = sk->wp;
 #endif
 
@@ -452,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * ResetConnection
 			 */
 		case SS_OFFLINE:
-			walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
-						sk->host, sk->port);
+			wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline",
+				   sk->host, sk->port);
 			break;				/* actually unreachable, but prevents
 								 * -Wimplicit-fallthrough */
 
@@ -488,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * requests.
 			 */
 		case SS_VOTING:
-			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-						sk->port, FormatSafekeeperState(sk));
+			wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
+				   sk->port, FormatSafekeeperState(sk));
 			ResetConnection(sk);
 			return;
 
@@ -517,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * Idle state for waiting votes from quorum.
 			 */
 		case SS_IDLE:
-			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-						sk->port, FormatSafekeeperState(sk));
+			wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
+				   sk->port, FormatSafekeeperState(sk));
 			ResetConnection(sk);
 			return;
 
@@ -543,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk)
 	switch (result)
 	{
 		case WP_CONN_POLLING_OK:
-			walprop_log(LOG, "connected with node %s:%s", sk->host,
-						sk->port);
+			wp_log(LOG, "connected with node %s:%s", sk->host,
+				   sk->port);
 			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
 
 			/*
@@ -567,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk)
 			break;
 
 		case WP_CONN_POLLING_FAILED:
-			walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
-						sk->host, sk->port, wp->api.conn_error_message(sk));
+			wp_log(WARNING, "failed to connect to node '%s:%s': %s",
+				   sk->host, sk->port, wp->api.conn_error_message(sk));
 
 			/*
 			 * If connecting failed, we don't want to restart the connection
@@ -604,8 +604,8 @@ SendStartWALPush(Safekeeper *sk)
 
 	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
 	{
-		walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
-					sk->host, sk->port, wp->api.conn_error_message(sk));
+		wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
+			   sk->host, sk->port, wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return;
 	}
@@ -641,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk)
 			break;
 
 		case WP_EXEC_FAILED:
-			walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
-						sk->host, sk->port, wp->api.conn_error_message(sk));
+			wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s",
+				   sk->host, sk->port, wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return;
 
@@ -652,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk)
 			 * wrong"
 			 */
 		case WP_EXEC_UNEXPECTED_SUCCESS:
-			walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
-						sk->host, sk->port);
+			wp_log(WARNING, "received bad response from safekeeper %s:%s query execution",
+				   sk->host, sk->port);
 			ShutdownConnection(sk);
 			return;
 	}
@@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
 		return;
 
-	walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
+	wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
 
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;
@@ -708,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		if (wp->n_connected == wp->quorum)
 		{
 			wp->propTerm++;
-			walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
+			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
 
 			wp->voteRequest = (VoteRequest)
 			{
@@ -721,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	else if (sk->greetResponse.term > wp->propTerm)
 	{
 		/* Another compute with higher term is running. */
-		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-					sk->host, sk->port,
-					sk->greetResponse.term, wp->propTerm);
+		wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+			   sk->host, sk->port,
+			   sk->greetResponse.term, wp->propTerm);
 	}
 
 	/*
@@ -763,7 +763,7 @@ SendVoteRequest(Safekeeper *sk)
 	WalProposer *wp = sk->wp;
 
 	/* We have quorum for voting, send our vote request */
-	walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
+	wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
 		return;
@@ -780,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
 		return;
 
-	walprop_log(LOG,
-				"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
-				sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
-				LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
-				LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
-				LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
+	wp_log(LOG,
+		   "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
+		   sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
+		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
+		   LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
+		   LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
 
 	/*
 	 * In case of acceptor rejecting our vote, bail out, but only if either it
@@ -795,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk)
 	if ((!sk->voteResponse.voteGiven) &&
 		(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
 	{
-		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-					sk->host, sk->port,
-					sk->voteResponse.term, wp->propTerm);
+		wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+			   sk->host, sk->port,
+			   sk->voteResponse.term, wp->propTerm);
 	}
 	Assert(sk->voteResponse.term == wp->propTerm);
 
@@ -841,7 +841,7 @@ HandleElectedProposer(WalProposer *wp)
 	 */
 	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
 	{
-		walprop_log(FATAL, "failed to download WAL for logical replicaiton");
+		wp_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
 
 	if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
@@ -948,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp)
 				if (wp->timelineStartLsn != InvalidXLogRecPtr &&
 					wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
 				{
-					walprop_log(WARNING,
-								"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
-								LSN_FORMAT_ARGS(wp->timelineStartLsn),
-								LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
+					wp_log(WARNING,
+						   "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
+						   LSN_FORMAT_ARGS(wp->timelineStartLsn),
+						   LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
 				}
 				wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
 			}
@@ -969,7 +969,7 @@ DetermineEpochStartLsn(WalProposer *wp)
 		{
 			wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
 		}
-		walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
+		wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 	}
 
 	/*
@@ -996,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;
 
-	walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-				wp->quorum,
-				wp->propTerm,
-				LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-				wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
-				LSN_FORMAT_ARGS(wp->truncateLsn));
+	wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
+		   wp->quorum,
+		   wp->propTerm,
+		   LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+		   wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		   LSN_FORMAT_ARGS(wp->truncateLsn));
 
 	/*
 	 * Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1034,10 +1034,10 @@ DetermineEpochStartLsn(WalProposer *wp)
 				 * scenario.
 				 */
 				disable_core_dump();
-				walprop_log(PANIC,
-							"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
-							LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-							LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
+				wp_log(PANIC,
+					   "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
+					   LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+					   LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
 			}
 		}
 		walprop_shared->mineLastElectedTerm = wp->propTerm;
@@ -1115,9 +1115,9 @@ SendProposerElected(Safekeeper *sk)
 			 */
 			sk->startStreamingAt = wp->truncateLsn;
 
-			walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
-						sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
-						LSN_FORMAT_ARGS(sk->startStreamingAt));
+			wp_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
+				   sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
+				   LSN_FORMAT_ARGS(sk->startStreamingAt));
 		}
 	}
 	else
@@ -1150,9 +1150,9 @@ SendProposerElected(Safekeeper *sk)
 	msg.timelineStartLsn = wp->timelineStartLsn;
 
 	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
-	walprop_log(LOG,
-				"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
-				sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
+	wp_log(LOG,
+		   "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
+		   sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
 
 	resetStringInfo(&sk->outbuf);
 	pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1261,8 +1261,8 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 	/* expected never to happen, c.f. walprop_pg_active_state_update_event_set */
 	if (events & WL_SOCKET_CLOSED)
 	{
-		walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
-					sk->host, sk->port);
+		wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
+			   sk->host, sk->port);
 		ShutdownConnection(sk);
 		return;
 	}
@@ -1323,12 +1323,12 @@ SendAppendRequests(Safekeeper *sk)
 			req = &sk->appendRequest;
 			PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
 
-			walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
-						req->endLsn - req->beginLsn,
-						LSN_FORMAT_ARGS(req->beginLsn),
-						LSN_FORMAT_ARGS(req->endLsn),
-						LSN_FORMAT_ARGS(req->commitLsn),
-						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
+			wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
+				   req->endLsn - req->beginLsn,
+				   LSN_FORMAT_ARGS(req->beginLsn),
+				   LSN_FORMAT_ARGS(req->endLsn),
+				   LSN_FORMAT_ARGS(req->commitLsn),
+				   LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
 
 			resetStringInfo(&sk->outbuf);
 
@@ -1355,8 +1355,8 @@ SendAppendRequests(Safekeeper *sk)
 				case NEON_WALREAD_WOULDBLOCK:
 					return true;
 				case NEON_WALREAD_ERROR:
-					walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
-								sk->host, sk->port, errmsg);
+					wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
+						   sk->host, sk->port, errmsg);
 					ShutdownConnection(sk);
 					return false;
 				default:
@@ -1388,9 +1388,9 @@ SendAppendRequests(Safekeeper *sk)
 					return true;
 
 				case PG_ASYNC_WRITE_FAIL:
-					walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
-								sk->host, sk->port, FormatSafekeeperState(sk),
-								wp->api.conn_error_message(sk));
+					wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
+						   sk->host, sk->port, FormatSafekeeperState(sk),
+						   wp->api.conn_error_message(sk));
 					ShutdownConnection(sk);
 					return false;
 				default:
@@ -1429,11 +1429,11 @@ RecvAppendResponses(Safekeeper *sk)
 		if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse))
 			break;
 
-		walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
-					sk->appendResponse.term,
-					LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
-					LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
-					sk->host, sk->port);
+		wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
+			   sk->appendResponse.term,
+			   LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
+			   LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
+			   sk->host, sk->port);
 
 		if (sk->appendResponse.term > wp->propTerm)
 		{
@@ -1443,9 +1443,9 @@ RecvAppendResponses(Safekeeper *sk)
 			 * core as this is kinda expected scenario.
 			 */
 			disable_core_dump();
-			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
-						sk->host, sk->port,
-						sk->appendResponse.term, wp->propTerm);
+			wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
+				   sk->host, sk->port,
+				   sk->appendResponse.term, wp->propTerm);
 		}
 
 		readAnything = true;
@@ -1489,32 +1489,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
-			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
-						rf->currentClusterSize);
+			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
+				   rf->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->last_received_lsn = pq_getmsgint64(reply_message);
-			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->last_received_lsn));
+			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
+				   LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
-			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
+			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
+				   LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
-			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
+			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
+				   LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
@@ -1526,8 +1526,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 
 				/* Copy because timestamptz_to_str returns a static buffer */
 				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
-				walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
-							rf->replytime, replyTimeStr);
+				wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
+					   rf->replytime, replyTimeStr);
 
 				pfree(replyTimeStr);
 			}
@@ -1541,7 +1541,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			 * Skip unknown keys to support backward compatibile protocol
 			 * changes
 			 */
-			walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
+			wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
 			pq_getmsgbytes(reply_message, len);
 		};
 	}
@@ -1606,7 +1606,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
 
 	if (wp->n_votes < wp->quorum)
 	{
-		walprop_log(WARNING, "GetDonor called before elections are won");
+		wp_log(WARNING, "GetDonor called before elections are won");
 		return NULL;
 	}
 
@@ -1734,9 +1734,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
 			return false;
 
 		case PG_ASYNC_READ_FAIL:
-			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
-						sk->port, FormatSafekeeperState(sk),
-						wp->api.conn_error_message(sk));
+			wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host,
+				   sk->port, FormatSafekeeperState(sk),
+				   wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 	}
@@ -1774,8 +1774,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 	tag = pq_getmsgint64_le(&s);
 	if (tag != anymsg->tag)
 	{
-		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
-					sk->port, FormatSafekeeperState(sk));
+		wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
+			   sk->port, FormatSafekeeperState(sk));
 		ResetConnection(sk);
 		return false;
 	}
@@ -1851,9 +1851,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
 
 	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 	{
-		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-					sk->host, sk->port, FormatSafekeeperState(sk),
-					wp->api.conn_error_message(sk));
+		wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
+			   sk->host, sk->port, FormatSafekeeperState(sk),
+			   wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return false;
 	}
@@ -1904,9 +1904,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 			wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
 			return false;
 		case PG_ASYNC_WRITE_FAIL:
-			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-						sk->host, sk->port, FormatSafekeeperState(sk),
-						wp->api.conn_error_message(sk));
+			wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
+				   sk->host, sk->port, FormatSafekeeperState(sk),
+				   wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 		default:
@@ -1943,9 +1943,9 @@ AsyncFlush(Safekeeper *sk)
 			/* Nothing to do; try again when the socket's ready */
 			return false;
 		case -1:
-			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
-						sk->host, sk->port, FormatSafekeeperState(sk),
-						wp->api.conn_error_message(sk));
+			wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s",
+				   sk->host, sk->port, FormatSafekeeperState(sk),
+				   wp->api.conn_error_message(sk));
 			ResetConnection(sk);
 			return false;
 		default:
@@ -1974,11 +1974,11 @@ CompareLsn(const void *a, const void *b)
  *
  * The strings are intended to be used as a prefix to "state", e.g.:
  *
- *   walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
+ *   wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
  *
  * If this sort of phrasing doesn't fit the message, instead use something like:
  *
- *   walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
+ *   wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
  */
 static char *
 FormatSafekeeperState(Safekeeper *sk)
@@ -2059,8 +2059,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
 		 * To give a descriptive message in the case of failure, we use elog
 		 * and then an assertion that's guaranteed to fail.
 		 */
-		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-					FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
+		wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
+			   FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
 		Assert(events_ok_for_state);
 	}
 }
@@ -2199,8 +2199,8 @@ FormatEvents(WalProposer *wp, uint32 events)
 
 	if (events & (~all_flags))
 	{
-		walprop_log(WARNING, "Event formatting found unexpected component %d",
-					events & (~all_flags));
+		wp_log(WARNING, "event formatting found unexpected component %d",
+			   events & (~all_flags));
 		return_str[6] = '*';
 		return_str[7] = '\0';
 	}
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 6d478076fe..688d8e6e52 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -707,11 +707,23 @@ extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
 #define WPEVENT		1337		/* special log level for walproposer internal
 								 * events */
 
+#define WP_LOG_PREFIX "[WP] "
+
+/*
+ * wp_log is used in pure wp code (walproposer.c), allowing the API callback
+ * to catch logging.
+ */
 #ifdef WALPROPOSER_LIB
 extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
-#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
+#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__)
 #else
-#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
+#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
 #endif
 
+/*
+ * And wpg_log is used in all other (postgres specific) walproposer code,
+ * just adding the prefix.
+ */
+#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
+
 #endif							/* __NEON_WALPROPOSER_H__ */
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 7773aabfab..a3edffa6cb 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -424,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
 {
 	StartReplicationCmd cmd;
 
-	elog(LOG, "WAL proposer starts streaming at %X/%X",
-		 LSN_FORMAT_ARGS(startpos));
+	wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
+			LSN_FORMAT_ARGS(startpos));
 	cmd.slotname = WAL_PROPOSER_SLOT_NAME;
 	cmd.timeline = wp->greetRequest.timeline;
 	cmd.startpoint = startpos;
@@ -549,7 +549,7 @@ walprop_pg_load_libpqwalreceiver(void)
 {
 	load_file("libpqwalreceiver", false);
 	if (WalReceiverFunctions == NULL)
-		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
+		wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
 }
 
 /* Helper function */
@@ -630,7 +630,7 @@ libpqwp_connect_start(char *conninfo)
 	 * PGconn structure"
 	 */
 	if (!pg_conn)
-		elog(FATAL, "failed to allocate new PGconn object");
+		wpg_log(FATAL, "failed to allocate new PGconn object");
 
 	/*
 	 * And in theory this allocation can fail as well, but it's incredibly
@@ -680,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk)
 			 * unused. We'll expect it's never returned.
 			 */
 		case PGRES_POLLING_ACTIVE:
-			elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
+			wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
 
 			/*
 			 * This return is never actually reached, but it's here to make
@@ -745,7 +745,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	 */
 	if (!result)
 	{
-		elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
+		wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
 		return WP_EXEC_UNEXPECTED_SUCCESS;
 	}
 
@@ -793,7 +793,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	}
 
 	if (unexpected_success)
-		elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
+		wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
 
 	return return_val;
 }
@@ -872,7 +872,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
 
 				if (status != PGRES_FATAL_ERROR)
-					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
+					wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
 
 				/*
 				 * If there was actually an error, it'll be properly reported
@@ -937,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 		case -1:
 			return PG_ASYNC_WRITE_FAIL;
 		default:
-			elog(FATAL, "invalid return %d from PQputCopyData", result);
+			wpg_log(FATAL, "invalid return %d from PQputCopyData", result);
 	}
 
 	/*
@@ -958,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 		case -1:
 			return PG_ASYNC_WRITE_FAIL;
 		default:
-			elog(FATAL, "invalid return %d from PQflush", result);
+			wpg_log(FATAL, "invalid return %d from PQflush", result);
 	}
 }
 
@@ -1247,8 +1247,8 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 	if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
 	{
 		startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024;
-		walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
-					LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
+		wpg_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
+				LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
 	}
 	timeline = wp->greetRequest.timeline;
 
@@ -1262,7 +1262,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 
 		written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
 		if (written > MAXCONNINFO || written < 0)
-			elog(FATAL, "could not append password to the safekeeper connection string");
+			wpg_log(FATAL, "could not append password to the safekeeper connection string");
 	}
 
 #if PG_MAJORVERSION_NUM < 16
@@ -1279,11 +1279,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 						err)));
 		return false;
 	}
-	elog(LOG,
-		 "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
-		 "%d",
-		 sk->host, sk->port, (uint32) (startpos >> 32),
-		 (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
+	wpg_log(LOG,
+			"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
+			"%d",
+			sk->host, sk->port, (uint32) (startpos >> 32),
+			(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
 
 	options.logical = false;
 	options.startpoint = startpos;
@@ -1481,11 +1481,11 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
 {
 	char		log_prefix[64];
 
-	snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
+	snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
 	Assert(!sk->xlogreader);
 	sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
 	if (sk->xlogreader == NULL)
-		elog(FATAL, "Failed to allocate xlog reader");
+		wpg_log(FATAL, "failed to allocate xlog reader");
 }
 
 static NeonWALReadResult
@@ -1549,7 +1549,7 @@ static void
 walprop_pg_init_event_set(WalProposer *wp)
 {
 	if (waitEvents)
-		elog(FATAL, "double-initialization of event set");
+		wpg_log(FATAL, "double-initialization of event set");
 
 	/* for each sk, we have socket plus potentially socket for neon walreader */
 	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
@@ -1581,7 +1581,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events)
 	Assert(sk->nwrEventPos == -1);
 	sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
 	sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
-	elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
+	wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
 }
 
 static void
@@ -1680,8 +1680,8 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
 {
 	WalProposer *wp = to_remove->wp;
 
-	elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
-		 to_remove->host, to_remove->port, is_sk);
+	wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d",
+			to_remove->host, to_remove->port, is_sk);
 
 	/*
 	 * Shortpath for exiting if have nothing to do. We never call this
@@ -1835,13 +1835,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
 	rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
 	rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;
 
-	elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
-		 " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
-		 rf->currentClusterSize,
-		 LSN_FORMAT_ARGS(rf->last_received_lsn),
-		 LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
-		 LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
-		 rf->replytime);
+	wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
+			" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
+			rf->currentClusterSize,
+			LSN_FORMAT_ARGS(rf->last_received_lsn),
+			LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
+			LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
+			rf->replytime);
 }
 
 /*
@@ -1987,7 +1987,7 @@ GetLogRepRestartLSN(WalProposer *wp)
 		{
 			uint64		download_range_mb;
 
-			elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
+			wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
 
 			/*
 			 * If we need to download more than a max_slot_wal_keep_size,
@@ -1999,8 +1999,8 @@ GetLogRepRestartLSN(WalProposer *wp)
 			download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
 			if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
 			{
-				walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
-							LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
+				wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
+						LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
 				return InvalidXLogRecPtr;
 			}
 

From f71110383c6647c0bf81f1c1f516a7c3decc8d66 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 2 Jan 2024 10:10:41 +0300
Subject: [PATCH 188/202] Remove second check for max_slot_wal_keep_size
 download size.

The download size is already checked in GetLogRepRestartLSN; the second
check was a rebase artifact.
---
 pgxn/neon/walproposer_pg.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index a3edffa6cb..61a2a54809 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1237,19 +1237,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 		return true;			/* recovery not needed */
 	endpos = wp->propEpochStartLsn;
 
-	/*
-	 * If we need to download more than a max_slot_wal_keep_size, cap to it to
-	 * avoid risk of exploding pg_wal. Logical replication won't work until
-	 * recreated, but at least compute would start; this also follows
-	 * max_slot_wal_keep_size semantics.
-	 */
-	download_range_mb = (endpos - startpos) / 1024 / 1024;
-	if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
-	{
-		startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024;
-		wpg_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
-				LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
-	}
 	timeline = wp->greetRequest.timeline;
 
 	if (!neon_auth_token)

From 42f41afcbd96be8dae785329495a7eed6cf55876 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 3 Jan 2024 10:36:53 +0000
Subject: [PATCH 189/202] tests: update pytest and boto3 dependencies (#6253)

## Problem

The version of pytest we were using emits a number of
DeprecationWarnings on latest python: these are fixed in latest release.

boto3 and python-dateutil also have deprecation warnings, but
unfortunately these aren't fixed upstream yet.



## Summary of changes

- Update pytest
- Update boto3 (this doesn't fix deprecation warnings, but by the time I
figured that out I had already done the update, and it's good hygiene
anyway)
---
 poetry.lock    | 49 ++++++++++++++++++++++++++-----------------------
 pyproject.toml |  4 ++--
 2 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 76dfd6d37d..c597d811bd 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -339,19 +339,19 @@ uvloop = ["uvloop (>=0.15.2)"]
 
 [[package]]
 name = "boto3"
-version = "1.26.16"
+version = "1.34.11"
 description = "The AWS SDK for Python"
 optional = false
-python-versions = ">= 3.7"
+python-versions = ">= 3.8"
 files = [
-    {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"},
-    {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"},
+    {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"},
+    {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"},
 ]
 
 [package.dependencies]
-botocore = ">=1.29.16,<1.30.0"
+botocore = ">=1.34.11,<1.35.0"
 jmespath = ">=0.7.1,<2.0.0"
-s3transfer = ">=0.6.0,<0.7.0"
+s3transfer = ">=0.10.0,<0.11.0"
 
 [package.extras]
 crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
@@ -702,22 +702,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"]
 
 [[package]]
 name = "botocore"
-version = "1.29.16"
+version = "1.34.11"
 description = "Low-level, data-driven core of boto 3."
 optional = false
-python-versions = ">= 3.7"
+python-versions = ">= 3.8"
 files = [
-    {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"},
-    {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"},
+    {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"},
+    {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"},
 ]
 
 [package.dependencies]
 jmespath = ">=0.7.1,<2.0.0"
 python-dateutil = ">=2.1,<3.0.0"
-urllib3 = ">=1.25.4,<1.27"
+urllib3 = [
+    {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
+    {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
+]
 
 [package.extras]
-crt = ["awscrt (==0.14.0)"]
+crt = ["awscrt (==0.19.19)"]
 
 [[package]]
 name = "botocore-stubs"
@@ -1889,13 +1892,13 @@ files = [
 
 [[package]]
 name = "pytest"
-version = "7.3.1"
+version = "7.4.4"
 description = "pytest: simple powerful testing with Python"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
-    {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
+    {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
+    {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
 ]
 
 [package.dependencies]
@@ -1907,7 +1910,7 @@ pluggy = ">=0.12,<2.0"
 tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 
 [package.extras]
-testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
+testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
 
 [[package]]
 name = "pytest-asyncio"
@@ -2230,20 +2233,20 @@ files = [
 
 [[package]]
 name = "s3transfer"
-version = "0.6.0"
+version = "0.10.0"
 description = "An Amazon S3 Transfer Manager"
 optional = false
-python-versions = ">= 3.7"
+python-versions = ">= 3.8"
 files = [
-    {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"},
-    {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"},
+    {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"},
+    {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"},
 ]
 
 [package.dependencies]
-botocore = ">=1.12.36,<2.0a.0"
+botocore = ">=1.33.2,<2.0a.0"
 
 [package.extras]
-crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]
+crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
 
 [[package]]
 name = "sarif-om"
@@ -2740,4 +2743,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
+content-hash = "8de8b05a9b35a6f76da7d7e3652ddbb521f1eca53fce7b933f537080a9d6eada"
diff --git a/pyproject.toml b/pyproject.toml
index 401acaeba4..18c8ece4a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ authors = []
 
 [tool.poetry.dependencies]
 python = "^3.9"
-pytest = "^7.3.1"
+pytest = "^7.4.4"
 psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
@@ -17,7 +17,7 @@ aiopg = "^1.4.0"
 Jinja2 = "^3.0.2"
 types-requests = "^2.31.0.0"
 types-psycopg2 = "^2.9.21.10"
-boto3 = "^1.26.16"
+boto3 = "^1.34.11"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
 moto = {extras = ["server"], version = "^4.1.2"}
 backoff = "^2.2.1"

From fb518aea0db046817987a463b1556ad950e97f09 Mon Sep 17 00:00:00 2001
From: Cuong Nguyen <ctring23@gmail.com>
Date: Wed, 3 Jan 2024 05:41:58 -0500
Subject: [PATCH 190/202] Add batch ingestion mechanism to avoid high
 contention (#5886)

## Problem
For context, this problem was observed in a research project where we
try to make neon run in multiple regions and I was asked by @hlinnaka to
make this PR.

In our project, we use the pageserver in a non-conventional way such
that we would send a larger number of requests to the pageserver than
normal (imagine postgres without the buffer pool). I measured the time
from the moment a WAL record left the safekeeper to when it reached the
pageserver
([code](https://github.com/umd-dslam/sunstorm-neon/blob/e593db1f5ab2505eb176c9faaf2e9b9ba36cb2c4/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs#L282-L287))
and observed that when the number of get_page_at_lsn requests was high,
the wal receiving time increased significantly (see the left side of the
graphs below).

Upon further investigation, I found that the delay was caused by this
line


https://github.com/neondatabase/neon/blob/d2ca4109191e92a9da340184e5bc71768853fe8e/pageserver/src/tenant/timeline.rs#L2348

The `get_layer_for_write` method is called for every value during WAL
ingestion, and it tries to acquire the layers write lock every time. This
results in high contention when the read lock is acquired more
frequently.


![Untitled](https://github.com/neondatabase/neon/assets/6244849/85460f4d-ead1-4532-bc64-736d0bfd7f16)

![Untitled2](https://github.com/neondatabase/neon/assets/6244849/84199ab7-5f0e-413b-a42b-f728f2225218)

## Summary of changes

It is unnecessary to call `get_layer_for_write` repeatedly for all
values in a WAL message since they would end up in the same memory layer
anyway, so I created batched versions of `InMemoryLayer::put_value`,
`InMemoryLayer::put_tombstone`, `Timeline::put_value`, and
`Timeline::put_tombstone` that acquire the locks once for a batch of
values.

Additionally, `DatadirModification` is changed to store multiple
versions of uncommitted values, and `WalIngest::ingest_record()` can now
ingest records without immediately committing them.

With these new APIs, the ingestion loop can be changed to commit once
every `ingest_batch_size` records. The `ingest_batch_size` variable is
exposed as a config option. If it is set to 1, we get the same behavior
as before this change. I found that setting this value to 100 seems to
work the best, and you can see its effect on the right side of the above
graphs.

---------

Co-authored-by: John Spray <john@neon.tech>
---
 pageserver/src/basebackup.rs                  |  15 +-
 pageserver/src/config.rs                      |  28 +-
 pageserver/src/import_datadir.rs              |   9 +-
 pageserver/src/page_service.rs                |  20 +-
 pageserver/src/pgdatadir_mapping.rs           | 237 ++++++++++-----
 pageserver/src/tenant/config.rs               |   2 +
 .../tenant/storage_layer/inmemory_layer.rs    |  43 ++-
 pageserver/src/tenant/timeline.rs             |  41 ++-
 pageserver/src/tenant/timeline/walreceiver.rs |   1 +
 .../walreceiver/connection_manager.rs         |   3 +
 .../walreceiver/walreceiver_connection.rs     |  36 ++-
 pageserver/src/walingest.rs                   | 272 ++++++++++--------
 12 files changed, 468 insertions(+), 239 deletions(-)

diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index ed452eae7d..7e5ae892ad 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -23,6 +23,7 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};
 
 use crate::context::RequestContext;
+use crate::pgdatadir_mapping::Version;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};
 
@@ -174,7 +175,7 @@ where
         ] {
             for segno in self
                 .timeline
-                .list_slru_segments(kind, self.lsn, self.ctx)
+                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
                 .await?
             {
                 self.add_slru_segment(kind, segno).await?;
@@ -192,7 +193,7 @@ where
             // Otherwise only include init forks of unlogged relations.
             let rels = self
                 .timeline
-                .list_rels(spcnode, dbnode, self.lsn, self.ctx)
+                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                 .await?;
             for &rel in rels.iter() {
                 // Send init fork as main fork to provide well formed empty
@@ -267,7 +268,7 @@ where
     async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
         let nblocks = self
             .timeline
-            .get_rel_size(src, self.lsn, false, self.ctx)
+            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
             .await?;
 
         // If the relation is empty, create an empty file
@@ -288,7 +289,7 @@ where
             for blknum in startblk..endblk {
                 let img = self
                     .timeline
-                    .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
+                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
                     .await?;
                 segment_data.extend_from_slice(&img[..]);
             }
@@ -310,7 +311,7 @@ where
     async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
         let nblocks = self
             .timeline
-            .get_slru_segment_size(slru, segno, self.lsn, self.ctx)
+            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
             .await?;
 
         let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -352,7 +353,7 @@ where
         let relmap_img = if has_relmap_file {
             let img = self
                 .timeline
-                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
+                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                 .await?;
 
             ensure!(
@@ -399,7 +400,7 @@ where
             if !has_relmap_file
                 && self
                     .timeline
-                    .list_rels(spcnode, dbnode, self.lsn, self.ctx)
+                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                     .await?
                     .is_empty()
             {
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 8516f397ca..4560f5eca0 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -76,6 +76,8 @@ pub mod defaults {
 
     pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
 
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
+
     ///
     /// Default built-in configuration file.
     ///
@@ -88,6 +90,7 @@ pub mod defaults {
 #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
 #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
 
+#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
 #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
 
 # initial superuser role name to use when creating a new tenant
@@ -108,6 +111,8 @@ pub mod defaults {
 
 #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
 
+#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -233,6 +238,9 @@ pub struct PageServerConf {
     /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
     /// heatmap uploads vs. other remote storage operations.
     pub heatmap_upload_concurrency: usize,
+
+    /// Maximum number of WAL records to be ingested and committed at the same time
+    pub ingest_batch_size: u64,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -314,6 +322,8 @@ struct PageServerConfigBuilder {
     control_plane_emergency_mode: BuilderValue<bool>,
 
     heatmap_upload_concurrency: BuilderValue<usize>,
+
+    ingest_batch_size: BuilderValue<u64>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -386,6 +396,8 @@ impl Default for PageServerConfigBuilder {
             control_plane_emergency_mode: Set(false),
 
             heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
+
+            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
         }
     }
 }
@@ -534,6 +546,10 @@ impl PageServerConfigBuilder {
         self.heatmap_upload_concurrency = BuilderValue::Set(value)
     }
 
+    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
+        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_warmup = self
             .concurrent_tenant_warmup
@@ -632,10 +648,12 @@ impl PageServerConfigBuilder {
             control_plane_emergency_mode: self
                 .control_plane_emergency_mode
                 .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
-
             heatmap_upload_concurrency: self
                 .heatmap_upload_concurrency
                 .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
+            ingest_batch_size: self
+                .ingest_batch_size
+                .ok_or(anyhow!("missing ingest_batch_size"))?,
         })
     }
 }
@@ -878,6 +896,7 @@ impl PageServerConf {
                 "heatmap_upload_concurrency" => {
                     builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                 },
+                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -949,6 +968,7 @@ impl PageServerConf {
             control_plane_api_token: None,
             control_plane_emergency_mode: false,
             heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
         }
     }
 }
@@ -1177,7 +1197,8 @@ background_task_maximum_delay = '334 s'
                 control_plane_api: None,
                 control_plane_api_token: None,
                 control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1238,7 +1259,8 @@ background_task_maximum_delay = '334 s'
                 control_plane_api: None,
                 control_plane_api_token: None,
                 control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+                ingest_batch_size: 100,
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index d95d75449d..d66df36b3a 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -21,6 +21,7 @@ use tracing::*;
 use walkdir::WalkDir;
 
 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
@@ -312,13 +313,16 @@ async fn import_wal(
         waldecoder.feed_bytes(&buf);
 
         let mut nrecords = 0;
-        let mut modification = tline.begin_modification(endpoint);
+        let mut modification = tline.begin_modification(last_lsn);
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= endpoint {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                     .await?;
+                WAL_INGEST.records_committed.inc();
+
+                modification.commit(ctx).await?;
                 last_lsn = lsn;
 
                 nrecords += 1;
@@ -448,13 +452,14 @@ pub async fn import_wal_from_tar(
 
         waldecoder.feed_bytes(&bytes[offset..]);
 
-        let mut modification = tline.begin_modification(end_lsn);
+        let mut modification = tline.begin_modification(last_lsn);
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= end_lsn {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                     .await?;
+                modification.commit(ctx).await?;
                 last_lsn = lsn;
 
                 debug!("imported record at {} (end {})", lsn, end_lsn);
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index d5ca7f7382..db07a600e5 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -53,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::rel_block_to_key;
+use crate::pgdatadir_mapping::{rel_block_to_key, Version};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -747,7 +747,7 @@ impl PageServerHandler {
                 .await?;
 
         let exists = timeline
-            .get_rel_exists(req.rel, lsn, req.latest, ctx)
+            .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
             .await?;
 
         Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -766,7 +766,9 @@ impl PageServerHandler {
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                 .await?;
 
-        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
+        let n_blocks = timeline
+            .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
+            .await?;
 
         Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
             n_blocks,
@@ -785,7 +787,13 @@ impl PageServerHandler {
                 .await?;
 
         let total_blocks = timeline
-            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
+            .get_db_size(
+                DEFAULTTABLESPACE_OID,
+                req.dbnode,
+                Version::Lsn(lsn),
+                req.latest,
+                ctx,
+            )
             .await?;
         let db_size = total_blocks as i64 * BLCKSZ as i64;
 
@@ -816,7 +824,7 @@ impl PageServerHandler {
         let key = rel_block_to_key(req.rel, req.blkno);
         let page = if timeline.get_shard_identity().is_key_local(&key) {
             timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
                 .await?
         } else {
             // The Tenant shard we looked up at connection start does not hold this particular
@@ -853,7 +861,7 @@ impl PageServerHandler {
             // the GateGuard was already held over the whole connection.
             let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
             timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
                 .await?
         };
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index e9884a15f5..9fe75e5baf 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -11,7 +11,7 @@ use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Context;
+use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes};
 use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -147,6 +147,7 @@ impl Timeline {
     {
         DatadirModification {
             tline: self,
+            pending_lsns: Vec::new(),
             pending_updates: HashMap::new(),
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
@@ -163,7 +164,7 @@ impl Timeline {
         &self,
         tag: RelTag,
         blknum: BlockNumber,
-        lsn: Lsn,
+        version: Version<'_>,
         latest: bool,
         ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
@@ -173,17 +174,20 @@ impl Timeline {
             ));
         }
 
-        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
+        let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
         if blknum >= nblocks {
             debug!(
                 "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                tag, blknum, lsn, nblocks
+                tag,
+                blknum,
+                version.get_lsn(),
+                nblocks
             );
             return Ok(ZERO_PAGE.clone());
         }
 
         let key = rel_block_to_key(tag, blknum);
-        self.get(key, lsn, ctx).await
+        version.get(self, key, ctx).await
     }
 
     // Get size of a database in blocks
@@ -191,16 +195,16 @@ impl Timeline {
         &self,
         spcnode: Oid,
         dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
         latest: bool,
         ctx: &RequestContext,
     ) -> Result<usize, PageReconstructError> {
         let mut total_blocks = 0;
 
-        let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
+        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
 
         for rel in rels {
-            let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
+            let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
             total_blocks += n_blocks as usize;
         }
         Ok(total_blocks)
@@ -210,7 +214,7 @@ impl Timeline {
     pub async fn get_rel_size(
         &self,
         tag: RelTag,
-        lsn: Lsn,
+        version: Version<'_>,
         latest: bool,
         ctx: &RequestContext,
     ) -> Result<BlockNumber, PageReconstructError> {
@@ -220,12 +224,12 @@ impl Timeline {
             ));
         }
 
-        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
             return Ok(nblocks);
         }
 
         if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, lsn, latest, ctx).await?
+            && !self.get_rel_exists(tag, version, latest, ctx).await?
         {
             // FIXME: Postgres sometimes calls smgrcreate() to create
             // FSM, and smgrnblocks() on it immediately afterwards,
@@ -235,7 +239,7 @@ impl Timeline {
         }
 
         let key = rel_size_to_key(tag);
-        let mut buf = self.get(key, lsn, ctx).await?;
+        let mut buf = version.get(self, key, ctx).await?;
         let nblocks = buf.get_u32_le();
 
         if latest {
@@ -246,7 +250,7 @@ impl Timeline {
             // latest=true, then it can not cause cache corruption, because with latest=true
             // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
             // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, lsn, nblocks);
+            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
         }
         Ok(nblocks)
     }
@@ -255,7 +259,7 @@ impl Timeline {
     pub async fn get_rel_exists(
         &self,
         tag: RelTag,
-        lsn: Lsn,
+        version: Version<'_>,
         _latest: bool,
         ctx: &RequestContext,
     ) -> Result<bool, PageReconstructError> {
@@ -266,12 +270,12 @@ impl Timeline {
         }
 
         // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
             return Ok(true);
         }
         // fetch directory listing
         let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
 
         match RelDirectory::des(&buf).context("deserialization failure") {
             Ok(dir) => {
@@ -291,12 +295,12 @@ impl Timeline {
         &self,
         spcnode: Oid,
         dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<HashSet<RelTag>, PageReconstructError> {
         // fetch directory listing
         let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
 
         match RelDirectory::des(&buf).context("deserialization failure") {
             Ok(dir) => {
@@ -332,11 +336,11 @@ impl Timeline {
         &self,
         kind: SlruKind,
         segno: u32,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<BlockNumber, PageReconstructError> {
         let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = self.get(key, lsn, ctx).await?;
+        let mut buf = version.get(self, key, ctx).await?;
         Ok(buf.get_u32_le())
     }
 
@@ -345,12 +349,12 @@ impl Timeline {
         &self,
         kind: SlruKind,
         segno: u32,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<bool, PageReconstructError> {
         // fetch directory listing
         let key = slru_dir_to_key(kind);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
 
         match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
             Ok(dir) => {
@@ -501,11 +505,11 @@ impl Timeline {
         mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
     ) -> Result<T, PageReconstructError> {
         for segno in self
-            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
+            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
             .await?
         {
             let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
                 .await?;
             for blknum in (0..nblocks).rev() {
                 let clog_page = self
@@ -531,13 +535,13 @@ impl Timeline {
     pub async fn list_slru_segments(
         &self,
         kind: SlruKind,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<HashSet<u32>, PageReconstructError> {
         // fetch directory entry
         let key = slru_dir_to_key(kind);
 
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
         match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
             Ok(dir) => Ok(dir.segments),
             Err(e) => Err(PageReconstructError::from(e)),
@@ -548,12 +552,12 @@ impl Timeline {
         &self,
         spcnode: Oid,
         dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
         let key = relmap_file_key(spcnode, dbnode);
 
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
         Ok(buf)
     }
 
@@ -652,7 +656,10 @@ impl Timeline {
 
         let mut total_size: u64 = 0;
         for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
+            for rel in self
+                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
+                .await?
+            {
                 if self.cancel.is_cancelled() {
                     return Err(CalculateLogicalSizeError::Cancelled);
                 }
@@ -692,7 +699,7 @@ impl Timeline {
             result.add_key(rel_dir_to_key(spcnode, dbnode));
 
             let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, lsn, ctx)
+                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
                 .await?
                 .into_iter()
                 .collect();
@@ -799,18 +806,39 @@ pub struct DatadirModification<'a> {
     /// in the state in 'tline' yet.
     pub tline: &'a Timeline,
 
-    /// Lsn assigned by begin_modification
-    pub lsn: Lsn,
+    /// Current LSN of the modification
+    lsn: Lsn,
 
     // The modifications are not applied directly to the underlying key-value store.
     // The put-functions add the modifications here, and they are flushed to the
     // underlying key-value store by the 'finish' function.
-    pending_updates: HashMap<Key, Value>,
-    pending_deletions: Vec<Range<Key>>,
+    pending_lsns: Vec<Lsn>,
+    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
+    pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
 }
 
 impl<'a> DatadirModification<'a> {
+    /// Get the current lsn
+    pub(crate) fn get_lsn(&self) -> Lsn {
+        self.lsn
+    }
+
+    /// Set the current lsn
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
+        ensure!(
+            lsn >= self.lsn,
+            "setting an older lsn {} than {} is not allowed",
+            lsn,
+            self.lsn
+        );
+        if lsn > self.lsn {
+            self.pending_lsns.push(self.lsn);
+            self.lsn = lsn;
+        }
+        Ok(())
+    }
+
     /// Initialize a completely new repository.
     ///
     /// This inserts the directory metadata entries that are assumed to
@@ -984,11 +1012,9 @@ impl<'a> DatadirModification<'a> {
         dbnode: Oid,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        let req_lsn = self.tline.get_last_record_lsn();
-
         let total_blocks = self
             .tline
-            .get_db_size(spcnode, dbnode, req_lsn, true, ctx)
+            .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
             .await?;
 
         // Remove entry from dbdir
@@ -1077,8 +1103,11 @@ impl<'a> DatadirModification<'a> {
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        let last_lsn = self.tline.get_last_record_lsn();
-        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
+        if self
+            .tline
+            .get_rel_exists(rel, Version::Modified(self), true, ctx)
+            .await?
+        {
             let size_key = rel_size_to_key(rel);
             // Fetch the old size first
             let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1323,17 +1352,23 @@ impl<'a> DatadirModification<'a> {
         let writer = self.tline.writer().await;
 
         // Flush relation and  SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::new();
-        for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(&key) || is_slru_block_key(key) {
-                // This bails out on first error without modifying pending_updates.
-                // That's Ok, cf this function's doc comment.
-                writer.put(key, self.lsn, &value, ctx).await?;
-            } else {
-                retained_pending_updates.insert(key, value);
+        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
+        for (key, values) in self.pending_updates.drain() {
+            for (lsn, value) in values {
+                if is_rel_block_key(&key) || is_slru_block_key(key) {
+                    // This bails out on first error without modifying pending_updates.
+                    // That's Ok, cf this function's doc comment.
+                    writer.put(key, lsn, &value, ctx).await?;
+                } else {
+                    retained_pending_updates
+                        .entry(key)
+                        .or_default()
+                        .push((lsn, value));
+                }
             }
         }
-        self.pending_updates.extend(retained_pending_updates);
+
+        self.pending_updates = retained_pending_updates;
 
         if pending_nblocks != 0 {
             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1350,18 +1385,28 @@ impl<'a> DatadirModification<'a> {
     ///
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
         let writer = self.tline.writer().await;
-        let lsn = self.lsn;
+
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;
 
-        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value, ctx).await?;
-        }
-        for key_range in self.pending_deletions.drain(..) {
-            writer.delete(key_range, lsn).await?;
+        if !self.pending_updates.is_empty() {
+            writer.put_batch(&self.pending_updates, ctx).await?;
+            self.pending_updates.clear();
         }
 
-        writer.finish_write(lsn);
+        if !self.pending_deletions.is_empty() {
+            writer.delete_batch(&self.pending_deletions).await?;
+            self.pending_deletions.clear();
+        }
+
+        self.pending_lsns.push(self.lsn);
+        for pending_lsn in self.pending_lsns.drain(..) {
+            // Ideally, we should be able to call writer.finish_write() only once
+            // with the highest LSN. However, the last_record_lsn variable in the
+            // timeline keeps track of the latest LSN and the immediate previous LSN
+            // so we need to record every LSN to not leave a gap between them.
+            writer.finish_write(pending_lsn);
+        }
 
         if pending_nblocks != 0 {
             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1370,44 +1415,86 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
-    pub(crate) fn is_empty(&self) -> bool {
-        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
+    pub(crate) fn len(&self) -> usize {
+        self.pending_updates.len() + self.pending_deletions.len()
     }
 
     // Internal helper functions to batch the modifications
 
     async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the pending updated
+        // Have we already updated the same key? Read the latest pending updated
         // version in that case.
         //
         // Note: we don't check pending_deletions. It is an error to request a
         // value that has been removed, deletion only avoids leaking storage.
-        if let Some(value) = self.pending_updates.get(&key) {
-            if let Value::Image(img) = value {
-                Ok(img.clone())
-            } else {
-                // Currently, we never need to read back a WAL record that we
-                // inserted in the same "transaction". All the metadata updates
-                // work directly with Images, and we never need to read actual
-                // data pages. We could handle this if we had to, by calling
-                // the walredo manager, but let's keep it simple for now.
-                Err(PageReconstructError::from(anyhow::anyhow!(
-                    "unexpected pending WAL record"
-                )))
+        if let Some(values) = self.pending_updates.get(&key) {
+            if let Some((_, value)) = values.last() {
+                return if let Value::Image(img) = value {
+                    Ok(img.clone())
+                } else {
+                    // Currently, we never need to read back a WAL record that we
+                    // inserted in the same "transaction". All the metadata updates
+                    // work directly with Images, and we never need to read actual
+                    // data pages. We could handle this if we had to, by calling
+                    // the walredo manager, but let's keep it simple for now.
+                    Err(PageReconstructError::from(anyhow::anyhow!(
+                        "unexpected pending WAL record"
+                    )))
+                };
             }
-        } else {
-            let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-            self.tline.get(key, lsn, ctx).await
         }
+        let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+        self.tline.get(key, lsn, ctx).await
     }
 
     fn put(&mut self, key: Key, val: Value) {
-        self.pending_updates.insert(key, val);
+        let values = self.pending_updates.entry(key).or_default();
+        // Replace the previous value if it exists at the same lsn
+        if let Some((last_lsn, last_value)) = values.last_mut() {
+            if *last_lsn == self.lsn {
+                *last_value = val;
+                return;
+            }
+        }
+        values.push((self.lsn, val));
     }
 
     fn delete(&mut self, key_range: Range<Key>) {
         trace!("DELETE {}-{}", key_range.start, key_range.end);
-        self.pending_deletions.push(key_range);
+        self.pending_deletions.push((key_range, self.lsn));
+    }
+}
+
+/// This struct facilitates accessing either a committed key from the timeline at a
+/// specific LSN, or the latest uncommitted key from a pending modification.
+/// During WAL ingestion, the records from multiple LSNs may be batched in the same
+/// modification before being flushed to the timeline. Hence, the routines in WalIngest
+/// need to look up the keys in the modification first before looking them up in the
+/// timeline to not miss the latest updates.
+#[derive(Clone, Copy)]
+pub enum Version<'a> {
+    Lsn(Lsn),
+    Modified(&'a DatadirModification<'a>),
+}
+
+impl<'a> Version<'a> {
+    async fn get(
+        &self,
+        timeline: &Timeline,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        match self {
+            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
+            Version::Modified(modification) => modification.get(key, ctx).await,
+        }
+    }
+
+    fn get_lsn(&self) -> Lsn {
+        match self {
+            Version::Lsn(lsn) => *lsn,
+            Version::Modified(modification) => modification.lsn,
+        }
     }
 }
 
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 25d97f51ce..2d4cd350d7 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -46,6 +46,8 @@ pub mod defaults {
     pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 003cf0e92b..7c9103eea8 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use tokio::sync::RwLock;
+use tokio::sync::{RwLock, RwLockWriteGuard};
 
 use super::{DeltaLayerWriter, ResidentLayer};
 
@@ -246,16 +246,43 @@ impl InMemoryLayer {
 
     /// Common subroutine of the public put_wal_record() and put_page_image() functions.
     /// Adds the page version to the in-memory tree
-    pub async fn put_value(
+    pub(crate) async fn put_value(
         &self,
         key: Key,
         lsn: Lsn,
         val: &Value,
         ctx: &RequestContext,
     ) -> Result<()> {
-        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let inner: &mut _ = &mut *self.inner.write().await;
+        let mut inner = self.inner.write().await;
         self.assert_writable();
+        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
+    }
+
+    pub(crate) async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        let mut inner = self.inner.write().await;
+        self.assert_writable();
+        for (key, vals) in values {
+            for (lsn, val) in vals {
+                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
+                    .await?;
+            }
+        }
+        Ok(())
+    }
+
+    async fn put_value_locked(
+        &self,
+        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
 
         let off = {
             // Avoid doing allocations for "small" values.
@@ -264,7 +291,7 @@ impl InMemoryLayer {
             let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
             buf.clear();
             val.ser_into(&mut buf)?;
-            inner
+            locked_inner
                 .file
                 .write_blob(
                     &buf,
@@ -275,7 +302,7 @@ impl InMemoryLayer {
                 .await?
         };
 
-        let vec_map = inner.index.entry(key).or_default();
+        let vec_map = locked_inner.index.entry(key).or_default();
         let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
         if old.is_some() {
             // We already had an entry for this LSN. That's odd..
@@ -285,13 +312,11 @@ impl InMemoryLayer {
         Ok(())
     }
 
-    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
         // TODO: Currently, we just leak the storage for any deleted keys
-
         Ok(())
     }
 
-    /// Make the layer non-writeable. Only call once.
     /// Records the end_lsn for non-dropped layers.
     /// `end_lsn` is exclusive
     pub async fn freeze(&self, end_lsn: Lsn) {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1e84fa1848..15a5ca1727 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1459,6 +1459,7 @@ impl Timeline {
                 max_lsn_wal_lag,
                 auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
                 availability_zone: self.conf.availability_zone.clone(),
+                ingest_batch_size: self.conf.ingest_batch_size,
             },
             broker_client,
             ctx,
@@ -2471,9 +2472,27 @@ impl Timeline {
         Ok(())
     }
 
-    async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
-        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_tombstone(key_range, lsn).await?;
+    async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Pick the first LSN in the batch to get the layer to write to.
+        for lsns in values.values() {
+            if let Some((lsn, _)) = lsns.first() {
+                let layer = self.get_layer_for_write(*lsn).await?;
+                layer.put_values(values, ctx).await?;
+                break;
+            }
+        }
+        Ok(())
+    }
+
+    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        if let Some((_, lsn)) = tombstones.first() {
+            let layer = self.get_layer_for_write(*lsn).await?;
+            layer.put_tombstones(tombstones).await?;
+        }
         Ok(())
     }
 
@@ -4529,8 +4548,16 @@ impl<'a> TimelineWriter<'a> {
         self.tl.put_value(key, lsn, value, ctx).await
     }
 
-    pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
-        self.tl.put_tombstone(key_range, lsn).await
+    pub(crate) async fn put_batch(
+        &self,
+        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.tl.put_values(batch, ctx).await
+    }
+
+    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        self.tl.put_tombstones(batch).await
     }
 
     /// Track the end of the latest digested WAL record.
@@ -4541,11 +4568,11 @@ impl<'a> TimelineWriter<'a> {
     /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
     /// the 'lsn' or anything older. The previous last record LSN is stored alongside
     /// the latest and can be read.
-    pub fn finish_write(&self, new_lsn: Lsn) {
+    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
         self.tl.finish_write(new_lsn);
     }
 
-    pub fn update_current_logical_size(&self, delta: i64) {
+    pub(crate) fn update_current_logical_size(&self, delta: i64) {
         self.tl.update_current_logical_size(delta)
     }
 }
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index e32265afb5..2fab6722b8 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -58,6 +58,7 @@ pub struct WalReceiverConf {
     pub max_lsn_wal_lag: NonZeroU64,
     pub auth_token: Option<Arc<String>>,
     pub availability_zone: Option<String>,
+    pub ingest_batch_size: u64,
 }
 
 pub struct WalReceiver {
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 5a5b3d7586..7fa5bb7689 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -411,6 +411,7 @@ impl ConnectionManagerState {
 
         let node_id = new_sk.safekeeper_id;
         let connect_timeout = self.conf.wal_connect_timeout;
+        let ingest_batch_size = self.conf.ingest_batch_size;
         let timeline = Arc::clone(&self.timeline);
         let ctx = ctx.detached_child(
             TaskKind::WalReceiverConnectionHandler,
@@ -430,6 +431,7 @@ impl ConnectionManagerState {
                     connect_timeout,
                     ctx,
                     node_id,
+                    ingest_batch_size,
                 )
                 .await;
 
@@ -1345,6 +1347,7 @@ mod tests {
                 max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
                 auth_token: None,
                 availability_zone: None,
+                ingest_batch_size: 1,
             },
             wal_connection: None,
             wal_stream_candidates: HashMap::new(),
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 61ab236322..e398d683e5 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::{
     context::RequestContext,
-    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
+    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
     task_mgr,
     task_mgr::TaskKind,
     task_mgr::WALRECEIVER_RUNTIME,
@@ -106,6 +106,7 @@ impl From<WalDecodeError> for WalReceiverError {
 
 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
+#[allow(clippy::too_many_arguments)]
 pub(super) async fn handle_walreceiver_connection(
     timeline: Arc<Timeline>,
     wal_source_connconf: PgConnectionConfig,
@@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection(
     connect_timeout: Duration,
     ctx: RequestContext,
     node: NodeId,
+    ingest_batch_size: u64,
 ) -> Result<(), WalReceiverError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
@@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection(
 
                 {
                     let mut decoded = DecodedWALRecord::default();
-                    let mut modification = timeline.begin_modification(endlsn);
+                    let mut modification = timeline.begin_modification(startlsn);
+                    let mut uncommitted_records = 0;
+                    let mut filtered_records = 0;
                     while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                         // It is important to deal with the aligned records as lsn in getPage@LSN is
                         // aligned and can be several bytes bigger. Without this alignment we are
@@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection(
                             return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                         }
 
-                        walingest
+                        // Ingest the records without immediately committing them.
+                        let ingested = walingest
                             .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
                             .await
                             .with_context(|| format!("could not ingest record at {lsn}"))?;
+                        if !ingested {
+                            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
+                            WAL_INGEST.records_filtered.inc();
+                            filtered_records += 1;
+                        }
 
                         fail_point!("walreceiver-after-ingest");
 
                         last_rec_lsn = lsn;
+
+                        // Commit every ingest_batch_size records. Even if we filtered out
+                        // all records, we still need to call commit to advance the LSN.
+                        uncommitted_records += 1;
+                        if uncommitted_records >= ingest_batch_size {
+                            WAL_INGEST
+                                .records_committed
+                                .inc_by(uncommitted_records - filtered_records);
+                            modification.commit(&ctx).await?;
+                            uncommitted_records = 0;
+                            filtered_records = 0;
+                        }
+                    }
+
+                    // Commit the remaining records.
+                    if uncommitted_records > 0 {
+                        WAL_INGEST
+                            .records_committed
+                            .inc_by(uncommitted_records - filtered_records);
+                        modification.commit(&ctx).await?;
                     }
                 }
 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index a6a8972970..8df0c81c7a 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -48,20 +48,18 @@ use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;
 
-pub struct WalIngest<'a> {
+pub struct WalIngest {
     shard: ShardIdentity,
-    timeline: &'a Timeline,
-
     checkpoint: CheckPoint,
     checkpoint_modified: bool,
 }
 
-impl<'a> WalIngest<'a> {
+impl WalIngest {
     pub async fn new(
-        timeline: &'a Timeline,
+        timeline: &Timeline,
         startpoint: Lsn,
-        ctx: &'_ RequestContext,
-    ) -> anyhow::Result<WalIngest<'a>> {
+        ctx: &RequestContext,
+    ) -> anyhow::Result<WalIngest> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
         let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -70,7 +68,6 @@ impl<'a> WalIngest<'a> {
 
         Ok(WalIngest {
             shard: *timeline.get_shard_identity(),
-            timeline,
             checkpoint,
             checkpoint_modified: false,
         })
@@ -84,6 +81,8 @@ impl<'a> WalIngest<'a> {
     /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
     /// relations/pages that the record affects.
     ///
+    /// This function returns `true` if the record was ingested, and `false` if it was filtered out
+    ///
     pub async fn ingest_record(
         &mut self,
         recdata: Bytes,
@@ -91,11 +90,13 @@ impl<'a> WalIngest<'a> {
         modification: &mut DatadirModification<'_>,
         decoded: &mut DecodedWALRecord,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<bool> {
         WAL_INGEST.records_received.inc();
+        let pg_version = modification.tline.pg_version;
+        let prev_len = modification.len();
 
-        modification.lsn = lsn;
-        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
+        modification.set_lsn(lsn)?;
+        decode_wal_record(recdata, decoded, pg_version)?;
 
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
@@ -132,9 +133,9 @@ impl<'a> WalIngest<'a> {
             }
             pg_constants::RM_DBASE_ID => {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");
+                debug!(%info, %pg_version, "handle RM_DBASE_ID");
 
-                if self.timeline.pg_version == 14 {
+                if pg_version == 14 {
                     if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
                         let createdb = XlCreateDatabase::decode(&mut buf);
                         debug!("XLOG_DBASE_CREATE v14");
@@ -150,7 +151,7 @@ impl<'a> WalIngest<'a> {
                                 .await?;
                         }
                     }
-                } else if self.timeline.pg_version == 15 {
+                } else if pg_version == 15 {
                     if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                         debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                     } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -170,7 +171,7 @@ impl<'a> WalIngest<'a> {
                                 .await?;
                         }
                     }
-                } else if self.timeline.pg_version == 16 {
+                } else if pg_version == 16 {
                     if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                         debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                     } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -399,19 +400,11 @@ impl<'a> WalIngest<'a> {
             self.checkpoint_modified = false;
         }
 
-        if modification.is_empty() {
-            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
-            WAL_INGEST.records_filtered.inc();
-            modification.tline.finish_write(lsn);
-        } else {
-            WAL_INGEST.records_committed.inc();
-            modification.commit(ctx).await?;
-        }
+        // Note that at this point this record is only cached in the modification
+        // until commit() is called to flush the data into the repository and update
+        // the latest LSN.
 
-        // Now that this record has been fully handled, including updating the
-        // checkpoint data, let the repository know that it is up-to-date to this LSN.
-
-        Ok(())
+        Ok(modification.len() > prev_len)
     }
 
     /// Do not store this block, but observe it for the purposes of updating our relation size state.
@@ -458,7 +451,7 @@ impl<'a> WalIngest<'a> {
             && (decoded.xl_info == pg_constants::XLOG_FPI
                 || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
             // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
+            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
             // do not materialize null pages because them most likely be soon replaced with real data
             && blk.bimg_len != 0
         {
@@ -511,7 +504,7 @@ impl<'a> WalIngest<'a> {
         let mut old_heap_blkno: Option<u32> = None;
         let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
 
-        match self.timeline.pg_version {
+        match modification.tline.pg_version {
             14 => {
                 if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                     let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -735,7 +728,7 @@ impl<'a> WalIngest<'a> {
             // replaying it would fail to find the previous image of the page, because
             // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
             // record if it doesn't.
-            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
+            let vm_size = get_relsize(modification, vm_rel, ctx).await?;
             if let Some(blknum) = new_vm_blk {
                 if blknum >= vm_size {
                     new_vm_blk = None;
@@ -816,10 +809,11 @@ impl<'a> WalIngest<'a> {
         let mut new_heap_blkno: Option<u32> = None;
         let mut old_heap_blkno: Option<u32> = None;
         let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
+        let pg_version = modification.tline.pg_version;
 
         assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
 
-        match self.timeline.pg_version {
+        match pg_version {
             16 => {
                 let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
 
@@ -882,7 +876,7 @@ impl<'a> WalIngest<'a> {
             }
             _ => bail!(
                 "Neon RMGR has no known compatibility with PostgreSQL version {}",
-                self.timeline.pg_version
+                pg_version
             ),
         }
 
@@ -905,7 +899,7 @@ impl<'a> WalIngest<'a> {
             // replaying it would fail to find the previous image of the page, because
             // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
             // record if it doesn't.
-            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
+            let vm_size = get_relsize(modification, vm_rel, ctx).await?;
             if let Some(blknum) = new_vm_blk {
                 if blknum >= vm_size {
                     new_vm_blk = None;
@@ -983,16 +977,14 @@ impl<'a> WalIngest<'a> {
         let src_db_id = rec.src_db_id;
         let src_tablespace_id = rec.src_tablespace_id;
 
-        // Creating a database is implemented by copying the template (aka. source) database.
-        // To copy all the relations, we need to ask for the state as of the same LSN, but we
-        // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
-        // the last valid LSN to advance up to it. So we use the previous record's LSN in the
-        // get calls instead.
-        let req_lsn = modification.tline.get_last_record_lsn();
-
         let rels = modification
             .tline
-            .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx)
+            .list_rels(
+                src_tablespace_id,
+                src_db_id,
+                Version::Modified(modification),
+                ctx,
+            )
             .await?;
 
         debug!("ingest_xlog_dbase_create: {} rels", rels.len());
@@ -1000,7 +992,12 @@ impl<'a> WalIngest<'a> {
         // Copy relfilemap
         let filemap = modification
             .tline
-            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx)
+            .get_relmap_file(
+                src_tablespace_id,
+                src_db_id,
+                Version::Modified(modification),
+                ctx,
+            )
             .await?;
         modification
             .put_relmap_file(tablespace_id, db_id, filemap, ctx)
@@ -1014,7 +1011,7 @@ impl<'a> WalIngest<'a> {
 
             let nblocks = modification
                 .tline
-                .get_rel_size(src_rel, req_lsn, true, ctx)
+                .get_rel_size(src_rel, Version::Modified(modification), true, ctx)
                 .await?;
             let dst_rel = RelTag {
                 spcnode: tablespace_id,
@@ -1032,7 +1029,13 @@ impl<'a> WalIngest<'a> {
 
                 let content = modification
                     .tline
-                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx)
+                    .get_rel_page_at_lsn(
+                        src_rel,
+                        blknum,
+                        Version::Modified(modification),
+                        true,
+                        ctx,
+                    )
                     .await?;
                 modification.put_rel_page_image(dst_rel, blknum, content)?;
                 num_blocks_copied += 1;
@@ -1103,7 +1106,7 @@ impl<'a> WalIngest<'a> {
                 modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
                 fsm_physical_page_no += 1;
             }
-            let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
+            let nblocks = get_relsize(modification, rel, ctx).await?;
             if nblocks > fsm_physical_page_no {
                 // check if something to do: FSM is larger than truncate position
                 self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -1125,7 +1128,7 @@ impl<'a> WalIngest<'a> {
                 modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
                 vm_page_no += 1;
             }
-            let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
+            let nblocks = get_relsize(modification, rel, ctx).await?;
             if nblocks > vm_page_no {
                 // check if something to do: VM is larger than truncate position
                 self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1198,10 +1201,9 @@ impl<'a> WalIngest<'a> {
                     dbnode: xnode.dbnode,
                     relnode: xnode.relnode,
                 };
-                let last_lsn = self.timeline.get_last_record_lsn();
                 if modification
                     .tline
-                    .get_rel_exists(rel, last_lsn, true, ctx)
+                    .get_rel_exists(rel, Version::Modified(modification), true, ctx)
                     .await?
                 {
                     self.put_rel_drop(modification, rel, ctx).await?;
@@ -1255,10 +1257,9 @@ impl<'a> WalIngest<'a> {
         // will block waiting for the last valid LSN to advance up to
         // it. So we use the previous record's LSN in the get calls
         // instead.
-        let req_lsn = modification.tline.get_last_record_lsn();
         for segno in modification
             .tline
-            .list_slru_segments(SlruKind::Clog, req_lsn, ctx)
+            .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
             .await?
         {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1470,20 +1471,6 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    async fn get_relsize(
-        &mut self,
-        rel: RelTag,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<BlockNumber> {
-        let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? {
-            0
-        } else {
-            self.timeline.get_rel_size(rel, lsn, true, ctx).await?
-        };
-        Ok(nblocks)
-    }
-
     async fn handle_rel_extend(
         &mut self,
         modification: &mut DatadirModification<'_>,
@@ -1495,7 +1482,6 @@ impl<'a> WalIngest<'a> {
         // Check if the relation exists. We implicitly create relations on first
         // record.
         // TODO: would be nice if to be more explicit about it
-        let last_lsn = modification.lsn;
 
         // Get current size and put rel creation if rel doesn't exist
         //
@@ -1503,11 +1489,14 @@ impl<'a> WalIngest<'a> {
         //       check the cache too. This is because eagerly checking the cache results in
         //       less work overall and 10% better performance. It's more work on cache miss
         //       but cache miss is rare.
-        let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
+        let old_nblocks = if let Some(nblocks) = modification
+            .tline
+            .get_cached_rel_size(&rel, modification.get_lsn())
+        {
             nblocks
-        } else if !self
-            .timeline
-            .get_rel_exists(rel, last_lsn, true, ctx)
+        } else if !modification
+            .tline
+            .get_rel_exists(rel, Version::Modified(modification), true, ctx)
             .await?
         {
             // create it with 0 size initially, the logic below will extend it
@@ -1517,7 +1506,10 @@ impl<'a> WalIngest<'a> {
                 .context("Relation Error")?;
             0
         } else {
-            self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
+            modification
+                .tline
+                .get_rel_size(rel, Version::Modified(modification), true, ctx)
+                .await?
         };
 
         if new_nblocks > old_nblocks {
@@ -1570,10 +1562,9 @@ impl<'a> WalIngest<'a> {
         // Check if the relation exists. We implicitly create relations on first
         // record.
         // TODO: would be nice if to be more explicit about it
-        let last_lsn = self.timeline.get_last_record_lsn();
-        let old_nblocks = if !self
-            .timeline
-            .get_slru_segment_exists(kind, segno, last_lsn, ctx)
+        let old_nblocks = if !modification
+            .tline
+            .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx)
             .await?
         {
             // create it with 0 size initially, the logic below will extend it
@@ -1582,8 +1573,9 @@ impl<'a> WalIngest<'a> {
                 .await?;
             0
         } else {
-            self.timeline
-                .get_slru_segment_size(kind, segno, last_lsn, ctx)
+            modification
+                .tline
+                .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx)
                 .await?
         };
 
@@ -1606,6 +1598,26 @@ impl<'a> WalIngest<'a> {
     }
 }
 
+async fn get_relsize(
+    modification: &DatadirModification<'_>,
+    rel: RelTag,
+    ctx: &RequestContext,
+) -> anyhow::Result<BlockNumber> {
+    let nblocks = if !modification
+        .tline
+        .get_rel_exists(rel, Version::Modified(modification), true, ctx)
+        .await?
+    {
+        0
+    } else {
+        modification
+            .tline
+            .get_rel_size(rel, Version::Modified(modification), true, ctx)
+            .await?
+    };
+    Ok(nblocks)
+}
+
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
@@ -1632,10 +1644,7 @@ mod tests {
 
     static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
 
-    async fn init_walingest_test<'a>(
-        tline: &'a Timeline,
-        ctx: &RequestContext,
-    ) -> Result<WalIngest<'a>> {
+    async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
         let mut m = tline.begin_modification(Lsn(0x10));
         m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
         m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
@@ -1680,29 +1689,29 @@ mod tests {
         // The relation was created at LSN 2, not visible at LSN 1 yet.
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
                 .await?,
             false
         );
         assert!(tline
-            .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
+            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
             .await
             .is_err());
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             1
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             3
         );
@@ -1710,46 +1719,46 @@ mod tests {
         // Check page contents at each LSN
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 2")
         );
 
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 3")
         );
 
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1 at 4")
         );
 
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1 at 4")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 2 at 5")
         );
@@ -1765,19 +1774,19 @@ mod tests {
         // Check reported size and contents after truncation
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
             2
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1 at 4")
         );
@@ -1785,13 +1794,13 @@ mod tests {
         // should still see the truncated block with older LSN
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             3
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 2 at 5")
         );
@@ -1804,7 +1813,7 @@ mod tests {
         m.commit(&ctx).await?;
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
                 .await?,
             0
         );
@@ -1817,19 +1826,19 @@ mod tests {
         m.commit(&ctx).await?;
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
                 .await?,
             2
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
                 .await?,
             ZERO_PAGE
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1")
         );
@@ -1842,21 +1851,21 @@ mod tests {
         m.commit(&ctx).await?;
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
             1501
         );
         for blk in 2..1500 {
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
                     .await?,
                 ZERO_PAGE
             );
         }
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1500")
         );
@@ -1883,13 +1892,13 @@ mod tests {
         // Check that rel exists and size is correct
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             1
         );
@@ -1902,7 +1911,7 @@ mod tests {
         // Check that rel is not visible anymore
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
                 .await?,
             false
         );
@@ -1920,13 +1929,13 @@ mod tests {
         // Check that rel exists and size is correct
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
             1
         );
@@ -1959,24 +1968,24 @@ mod tests {
         // The relation was created at LSN 20, not visible at LSN 1 yet.
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
                 .await?,
             false
         );
         assert!(tline
-            .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
+            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
             .await
             .is_err());
 
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             relsize
         );
@@ -1987,7 +1996,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
                     .await?,
                 TEST_IMG(&data)
             );
@@ -2004,7 +2013,7 @@ mod tests {
         // Check reported size and contents after truncation
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
             1
         );
@@ -2014,7 +2023,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
                     .await?,
                 TEST_IMG(&data)
             );
@@ -2023,7 +2032,7 @@ mod tests {
         // should still see all blocks with older LSN
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             relsize
         );
@@ -2032,7 +2041,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
                     .await?,
                 TEST_IMG(&data)
             );
@@ -2052,13 +2061,13 @@ mod tests {
 
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
             relsize
         );
@@ -2068,7 +2077,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
                     .await?,
                 TEST_IMG(&data)
             );
@@ -2101,7 +2110,9 @@ mod tests {
         assert_current_logical_size(&tline, Lsn(lsn));
 
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
+            tline
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
+                .await?,
             RELSEG_SIZE + 1
         );
 
@@ -2113,7 +2124,9 @@ mod tests {
             .await?;
         m.commit(&ctx).await?;
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
+            tline
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
+                .await?,
             RELSEG_SIZE
         );
         assert_current_logical_size(&tline, Lsn(lsn));
@@ -2126,7 +2139,9 @@ mod tests {
             .await?;
         m.commit(&ctx).await?;
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
+            tline
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
+                .await?,
             RELSEG_SIZE - 1
         );
         assert_current_logical_size(&tline, Lsn(lsn));
@@ -2142,7 +2157,9 @@ mod tests {
                 .await?;
             m.commit(&ctx).await?;
             assert_eq!(
-                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
+                tline
+                    .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
+                    .await?,
                 size as BlockNumber
             );
 
@@ -2179,7 +2196,7 @@ mod tests {
         let wal_segment_path = format!("{path}/000000010000000000000001.zst");
         let source_initdb_path = format!("{path}/{INITDB_PATH}");
         let startpoint = Lsn::from_hex("14AEC08").unwrap();
-        let endpoint = Lsn::from_hex("1FFFF98").unwrap();
+        let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
 
         let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
         let (tenant, ctx) = harness.load().await;
@@ -2221,7 +2238,7 @@ mod tests {
         let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
             .await
             .unwrap();
-        let mut modification = tline.begin_modification(endpoint);
+        let mut modification = tline.begin_modification(startpoint);
         let mut decoded = DecodedWALRecord::default();
         println!("decoding {} bytes", bytes.len() - xlogoff);
 
@@ -2235,6 +2252,7 @@ mod tests {
                     .await
                     .unwrap();
             }
+            modification.commit(&ctx).await.unwrap();
         }
 
         let duration = started_at.elapsed();

From 673a86505594d816b0eea2560a797291db8ed4bd Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 3 Jan 2024 11:50:58 +0000
Subject: [PATCH 191/202] tests: tolerate 304 when evicting layers (#6261)

In tests that evict layers, explicit eviction can race with automatic
eviction of the same layer and result in a 304 response.
---
 test_runner/fixtures/pageserver/http.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index add6c4288a..6dea0d923d 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -714,7 +714,7 @@ class PageserverHttpClient(requests.Session):
         )
         self.verbose_error(res)
 
-        assert res.status_code == 200
+        assert res.status_code in (200, 304)
 
     def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
         info = self.layer_map_info(tenant_id, timeline_id)

From 17b256679bb84c60401c801a779ecef41a395e00 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 3 Jan 2024 13:02:04 +0000
Subject: [PATCH 192/202] vm-image-spec: build pgbouncer from Neon's fork
 (#6249)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

We need to add one more patch to pgbouncer (for
https://github.com/neondatabase/neon/issues/5801). I've decided to
cherry-pick all required patches to a pgbouncer fork
(`neondatabase/pgbouncer`) and use it instead.

See
https://github.com/neondatabase/pgbouncer/releases/tag/pgbouncer_1_21_0-neon-1

## Summary of changes
- Revert the previous patch (for deallocate/discard all) — the fork
already contains it.
- Remove `libssl-dev` dependency — we build pgbouncer without `openssl`
support.
- Clone git tag and build pgbouncer from source code.
---
 vm-image-spec.yaml | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 68be0b3617..704e3721d6 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -167,22 +167,21 @@ build: |
       && apt-get update \
       && apt-get install -y \
           build-essential \
-          curl \
+          git \
           libevent-dev \
-          libssl-dev \
-          patchutils \
+          libtool \
           pkg-config
 
-  ENV PGBOUNCER_VERSION 1.21.0
-  ENV PGBOUNCER_GITPATH 1_21_0
+  # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits.
+  # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
+  ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1
   RUN set -e \
-      && curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \
-      && tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \
-      && cd pgbouncer-${PGBOUNCER_VERSION} \
-      && curl https://github.com/pgbouncer/pgbouncer/commit/a7b3c0a5f4caa9dbe92743d04cf1e28c4c05806c.patch | filterdiff --include a/src/server.c | patch -p1 \
+      && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \
+      && cd pgbouncer \
+      && ./autogen.sh \
       && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
-      && make -j $(nproc) \
-      && make install
+      && make -j $(nproc) dist_man_MANS= \
+      && make install dist_man_MANS=
 merge: |
   # tweak nofile limits
   RUN set -e \

From 65b4e6e7d6a89c1217c4c08c8118e252d3e17809 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 2 Jan 2024 11:24:52 +0300
Subject: [PATCH 193/202] Remove empty safekeeper init since truncateLsn.

It has caveats such as creating a half-empty segment which can't be
offloaded. Instead we'll pursue the approach of pull_timeline, seeding new
state from some peer.
---
 pgxn/neon/walproposer.c                  | 34 ++++--------------------
 test_runner/regress/test_wal_acceptor.py |  2 +-
 2 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 2ea724f927..1f7c473e7d 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1091,34 +1091,10 @@ SendProposerElected(Safekeeper *sk)
 	{
 		/* safekeeper is empty or no common point, start from the beginning */
 		sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
-
-		if (sk->startStreamingAt < wp->truncateLsn)
-		{
-			/*
-			 * There's a gap between the WAL starting point and a truncateLsn,
-			 * which can't appear in a normal working cluster. That gap means
-			 * that all safekeepers reported that they have persisted WAL up
-			 * to the truncateLsn before, but now current safekeeper tells
-			 * otherwise.
-			 *
-			 * Also we have a special condition here, which is empty
-			 * safekeeper with no history. In combination with a gap, that can
-			 * happen when we introduce a new safekeeper to the cluster. This
-			 * is a rare case, which is triggered manually for now, and should
-			 * be treated with care.
-			 */
-
-			/*
-			 * truncateLsn will not change without ack from current
-			 * safekeeper, and it's aligned to the WAL record, so we can
-			 * safely start streaming from this point.
-			 */
-			sk->startStreamingAt = wp->truncateLsn;
-
-			wp_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
-				   sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
-				   LSN_FORMAT_ARGS(sk->startStreamingAt));
-		}
+		wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
+		 	 sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
+		/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
+		Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
 	}
 	else
 	{
@@ -1141,7 +1117,7 @@ SendProposerElected(Safekeeper *sk)
 		}
 	}
 
-	Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn);
+	Assert(sk->startStreamingAt <= wp->availableLsn);
 
 	msg.tag = 'e';
 	msg.term = wp->propTerm;
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index cf8df389c8..4dfc883f4c 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -566,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
         f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb"
     )
 
-    endpoint.stop_and_destroy()
+    endpoint.stop()
     timeline_delete_wait_completed(ps_http, tenant_id, timeline_id)
 
     # Also delete and manually create timeline on safekeepers -- this tests

From edc962f1d7ac66bfb8dfe8f63d253d9389fc7b5c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 3 Jan 2024 14:22:17 +0000
Subject: [PATCH 194/202] test_runner: test_issue_5878 log allow list (#6259)

## Problem


https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6254/7388706419/index.html#suites/5a4b8734277a9878cb429b80c314f470/e54c4f6f6ed22672

## Summary of changes

Permit these log messages: because the test helper's detach function
increments the generation number, a detach/attach cycle can cause the
errors if the test runner node is slow enough for the opportunistic
deletion queue flush on detach not to complete by the time we call
attach.
---
 test_runner/regress/test_layers_from_future.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 340188c1ae..999e077e45 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(
+        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+    )
 
     ps_http = env.pageserver.http_client()
 

From 753d97bd77f6d2c87f7167133585554eb463f01b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 22 Dec 2023 10:12:34 +0000
Subject: [PATCH 195/202] pageserver: don't delete ancestor shard layers

---
 .../src/tenant/remote_timeline_client.rs      | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 1b0cf39fbe..60b40d70a7 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -818,8 +818,25 @@ impl RemoteTimelineClient {
     fn schedule_deletion_of_unlinked0(
         self: &Arc<Self>,
         upload_queue: &mut UploadQueueInitialized,
-        with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
     ) {
+        // Filter out any layers which were not created by this tenant shard.  These are
+        // layers that originate from some ancestor shard after a split, and may still
+        // be referenced by other shards. We are free to delete them locally and remove
+        // them from our index (and would have already done so when we reach this point
+        // in the code), but we may not delete them remotely.
+        with_metadata.retain(|(name, meta)| {
+            let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
+                && meta.shard.shard_count == self.tenant_shard_id.shard_count;
+            if !retain {
+                tracing::debug!(
+                    "Skipping deletion of ancestor-shard layer {name}, from shard {}",
+                    meta.shard
+                );
+            }
+            retain
+        });
+
         for (name, meta) in &with_metadata {
             info!(
                 "scheduling deletion of layer {}{} (shard {})",

From 6c79e12630b1330e838af8eb07346550a8bf7912 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 27 Dec 2023 15:57:17 +0000
Subject: [PATCH 196/202] pageserver: drop unwanted keys during compaction
 after split

---
 libs/pageserver_api/src/keyspace.rs |  3 +++
 libs/pageserver_api/src/shard.rs    | 15 +++++++++++++
 pageserver/src/tenant/timeline.rs   | 33 ++++++++++++++++++++++++-----
 3 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index 16651c322e..80183506d8 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -124,6 +124,9 @@ impl KeySpaceAccum {
                 if range.start == accum.end {
                     accum.end = range.end;
                 } else {
+                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
+                    // a new range here if the skipped region was all keys that don't belong on this shard.
+                    // (https://github.com/neondatabase/neon/issues/6247)
                     assert!(range.start > accum.end);
                     self.ranges.push(accum.clone());
                     *accum = range;
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 3e4936eec4..a186d93bce 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -422,6 +422,21 @@ impl ShardIdentity {
         }
     }
 
+    /// Return true if the key should be discarded if found in this shard's
+    /// data store, e.g. during compaction after a split
+    pub fn is_key_disposable(&self, key: &Key) -> bool {
+        if key_is_shard0(key) {
+            // Q: Why can't we dispose of shard0 content if we're not shard 0?
+            // A: because the WAL ingestion logic currently ingests some shard 0
+            //    content on all shards, even though it's only read on shard 0.  If we
+            //    dropped it, then subsequent WAL ingest to these keys would encounter
+            //    an error.
+            false
+        } else {
+            !self.is_key_local(key)
+        }
+    }
+
     pub fn shard_slug(&self) -> String {
         if self.count > ShardCount(0) {
             format!("-{:02x}{:02x}", self.number.0, self.count.0)
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 15a5ca1727..e8340a74b2 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -496,6 +496,11 @@ impl Timeline {
             return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
         }
 
+        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
+        // already checked the key against the shard_identity when looking up the Timeline from
+        // page_service.
+        debug_assert!(!self.shard_identity.is_key_disposable(&key));
+
         // XXX: structured stats collection for layer eviction here.
         trace!(
             "get page request for {}@{} from task kind {:?}",
@@ -2224,13 +2229,13 @@ impl Timeline {
                     return Err(layer_traversal_error(
                         if cfg!(test) {
                             format!(
-                                "could not find data for key {} at LSN {}, for request at LSN {}\n{}",
-                                key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
+                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
+                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
                             )
                         } else {
                             format!(
-                                "could not find data for key {} at LSN {}, for request at LSN {}",
-                                key, cont_lsn, request_lsn
+                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
+                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
                             )
                         },
                         traversal_path,
@@ -3054,6 +3059,15 @@ impl Timeline {
                 for range in &partition.ranges {
                     let mut key = range.start;
                     while key < range.end {
+                        if self.shard_identity.is_key_disposable(&key) {
+                            debug!(
+                                "Dropping key {} during compaction (it belongs on shard {:?})",
+                                key,
+                                self.shard_identity.get_shard_number(&key)
+                            );
+                            key = key.next();
+                            continue;
+                        }
                         let img = match self.get(key, lsn, ctx).await {
                             Ok(img) => img,
                             Err(err) => {
@@ -3080,6 +3094,7 @@ impl Timeline {
                                 }
                             }
                         };
+
                         image_layer_writer.put_image(key, &img).await?;
                         key = key.next();
                     }
@@ -3650,7 +3665,15 @@ impl Timeline {
                 )))
             });
 
-            writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            if !self.shard_identity.is_key_disposable(&key) {
+                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            } else {
+                debug!(
+                    "Dropping key {} during compaction (it belongs on shard {:?})",
+                    key,
+                    self.shard_identity.get_shard_number(&key)
+                );
+            }
 
             if !new_layers.is_empty() {
                 fail_point!("after-timeline-compacted-first-L1");

From ef7c9c2ccc1a385f74455f45b54faa5b101065e6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 22 Dec 2023 13:58:00 +0000
Subject: [PATCH 197/202] pageserver: fix active tenant lookup hitting
 secondaries with sharding

If there is some secondary shard for a tenant on the same
node as an attached shard, the secondary shard could trip up
this code and cause page_service to incorrectly
get an error instead of finding the attached shard.
---
 pageserver/src/tenant/mgr.rs | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 62922e8c99..250de7247d 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -130,7 +130,7 @@ impl TenantsMap {
 
     /// A page service client sends a TenantId, and to look up the correct Tenant we must
     /// resolve this to a fully qualified TenantShardId.
-    fn resolve_shard(
+    fn resolve_attached_shard(
         &self,
         tenant_id: &TenantId,
         selector: ShardSelector,
@@ -140,25 +140,27 @@ impl TenantsMap {
             TenantsMap::Initializing => None,
             TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                 for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    // Ignore all slots that don't contain an attached tenant
+                    let tenant = match &slot.1 {
+                        TenantSlot::Attached(t) => t,
+                        _ => continue,
+                    };
+
                     match selector {
                         ShardSelector::First => return Some(*slot.0),
                         ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                             return Some(*slot.0)
                         }
                         ShardSelector::Page(key) => {
-                            if let Some(tenant) = slot.1.get_attached() {
-                                // First slot we see for this tenant, calculate the expected shard number
-                                // for the key: we will use this for checking if this and subsequent
-                                // slots contain the key, rather than recalculating the hash each time.
-                                if want_shard.is_none() {
-                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                                }
+                            // First slot we see for this tenant, calculate the expected shard number
+                            // for the key: we will use this for checking if this and subsequent
+                            // slots contain the key, rather than recalculating the hash each time.
+                            if want_shard.is_none() {
+                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                            }
 
-                                if Some(tenant.shard_identity.number) == want_shard {
-                                    return Some(*slot.0);
-                                }
-                            } else {
-                                continue;
+                            if Some(tenant.shard_identity.number) == want_shard {
+                                return Some(*slot.0);
                             }
                         }
                         _ => continue,
@@ -1257,9 +1259,11 @@ pub(crate) async fn get_active_tenant_with_timeout(
         let locked = TENANTS.read().unwrap();
 
         // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
-            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
-        )?;
+        let tenant_shard_id = locked
+            .resolve_attached_shard(&tenant_id, shard_selector)
+            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
+                tenant_id,
+            )))?;
 
         let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
             .map_err(GetTenantError::MapState)?;

From 34ebfbdd6f509f4bd2eab807c2730f987ba5b0df Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Dec 2023 15:13:00 +0000
Subject: [PATCH 198/202] pageserver: fix handling getpage with multiple shards
 on one node

Previously, we would wait for the LSN to be visible on whichever
timeline we happened to load at the start of the connection, then
proceed to look up the correct timeline for the key and do the read.

If the timeline holding the key was behind the timeline we used
for the LSN wait, then we might serve an apparently-successful read result
that actually contains data from behind the requested lsn.
---
 pageserver/src/page_service.rs | 41 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index db07a600e5..be9f478f25 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -802,7 +802,7 @@ impl PageServerHandler {
         }))
     }
 
-    async fn handle_get_page_at_lsn_request(
+    async fn do_handle_get_page_at_lsn_request(
         &self,
         timeline: &Timeline,
         req: &PagestreamGetPageRequest,
@@ -812,20 +812,25 @@ impl PageServerHandler {
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                 .await?;
-        /*
-        // Add a 1s delay to some requests. The delay helps the requests to
-        // hit the race condition from github issue #1047 more easily.
-        use rand::Rng;
-        if rand::thread_rng().gen::<u8>() < 5 {
-            std::thread::sleep(std::time::Duration::from_millis(1000));
-        }
-        */
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
+            .await?;
 
+        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+            page,
+        }))
+    }
+
+    async fn handle_get_page_at_lsn_request(
+        &self,
+        timeline: &Timeline,
+        req: &PagestreamGetPageRequest,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let key = rel_block_to_key(req.rel, req.blkno);
-        let page = if timeline.get_shard_identity().is_key_local(&key) {
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
-                .await?
+        if timeline.get_shard_identity().is_key_local(&key) {
+            self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
+                .await
         } else {
             // The Tenant shard we looked up at connection start does not hold this particular
             // key: look for other shards in this tenant.  This scenario occurs if a pageserver
@@ -860,14 +865,10 @@ impl PageServerHandler {
             // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
             // the GateGuard was already held over the whole connection.
             let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
-                .await?
-        };
 
-        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
-            page,
-        }))
+            self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
+                .await
+        }
     }
 
     #[allow(clippy::too_many_arguments)]

From 73a944205b5d76efd791a3218004423204790789 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 22 Dec 2023 11:52:09 +0000
Subject: [PATCH 199/202] pageserver: log details on shard routing error

---
 pageserver/src/page_service.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index be9f478f25..d478d375f8 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -857,6 +857,8 @@ impl PageServerHandler {
                     // informed yet.
                     //
                     // https://github.com/neondatabase/neon/issues/6038
+                    tracing::warn!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
+                        timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
                     return Err(anyhow::anyhow!("Request routed to wrong shard"));
                 }
                 Err(e) => return Err(e.into()),

From a2e083ebe04fd0283c26dba8d7fd9e981623aad9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Dec 2023 13:43:56 +0000
Subject: [PATCH 200/202] pageserver: make walredo shard-aware

This does not have a functional impact, but enables all
the logging in this code to include the shard_id
label.
---
 pageserver/benches/bench_walredo.rs |  5 ++-
 pageserver/src/tenant.rs            |  6 +--
 pageserver/src/walredo.rs           | 64 ++++++++++++++++-------------
 3 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index ba41866935..4837626086 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -13,6 +13,7 @@ use bytes::{Buf, Bytes};
 use pageserver::{
     config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
+use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};
 
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) {
 
     let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
-    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
-    let manager = PostgresRedoManager::new(conf, tenant_id);
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
 
     let manager = Arc::new(manager);
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e50987c84b..1660de8923 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -595,10 +595,9 @@ impl Tenant {
         mode: SpawnMode,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Tenant>> {
-        // TODO(sharding): make WalRedoManager shard-aware
         let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
             conf,
-            tenant_shard_id.tenant_id,
+            tenant_shard_id,
         )));
 
         let TenantSharedResources {
@@ -1145,10 +1144,9 @@ impl Tenant {
         tenant_shard_id: TenantShardId,
         reason: String,
     ) -> Arc<Tenant> {
-        // TODO(sharding): make WalRedoManager shard-aware
         let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
             conf,
-            tenant_shard_id.tenant_id,
+            tenant_shard_id,
         )));
         Arc::new(Tenant::new(
             TenantState::Broken {
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 94e95fd3b3..6918698f29 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,6 +22,7 @@ use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
+use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use std::collections::VecDeque;
 use std::io;
@@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
+use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
 
 #[cfg(feature = "testing")]
 use std::sync::atomic::{AtomicUsize, Ordering};
 
-#[cfg(feature = "testing")]
-use pageserver_api::shard::TenantShardId;
-
 use crate::config::PageServerConf;
 use crate::metrics::{
     WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
@@ -92,7 +90,7 @@ struct ProcessOutput {
 /// records.
 ///
 pub struct PostgresRedoManager {
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     conf: &'static PageServerConf,
     last_redo_at: std::sync::Mutex<Option<Instant>>,
     redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
@@ -186,10 +184,13 @@ impl PostgresRedoManager {
     ///
     /// Create a new PostgresRedoManager.
     ///
-    pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
+    pub fn new(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+    ) -> PostgresRedoManager {
         // The actual process is launched lazily, on first request.
         PostgresRedoManager {
-            tenant_id,
+            tenant_shard_id,
             conf,
             last_redo_at: std::sync::Mutex::default(),
             redo_process: RwLock::new(None),
@@ -244,8 +245,12 @@ impl PostgresRedoManager {
                                 let timer =
                                     WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
                                 let proc = Arc::new(
-                                    WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
-                                        .context("launch walredo process")?,
+                                    WalRedoProcess::launch(
+                                        self.conf,
+                                        self.tenant_shard_id,
+                                        pg_version,
+                                    )
+                                    .context("launch walredo process")?,
                                 );
                                 timer.observe_duration();
                                 *proc_guard = Some(Arc::clone(&proc));
@@ -638,7 +643,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 struct WalRedoProcess {
     #[allow(dead_code)]
     conf: &'static PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     // Some() on construction, only becomes None on Drop.
     child: Option<NoLeakChild>,
     stdout: Mutex<ProcessOutput>,
@@ -652,10 +657,10 @@ impl WalRedoProcess {
     //
     // Start postgres binary in special WAL redo mode.
     //
-    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
+    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
     fn launch(
         conf: &'static PageServerConf,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         pg_version: u32,
     ) -> anyhow::Result<Self> {
         let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
@@ -680,7 +685,7 @@ impl WalRedoProcess {
             // as close-on-exec by default, but that's not enough, since we use
             // libraries that directly call libc open without setting that flag.
             .close_fds()
-            .spawn_no_leak_child(tenant_id)
+            .spawn_no_leak_child(tenant_shard_id)
             .context("spawn process")?;
         WAL_REDO_PROCESS_COUNTERS.started.inc();
         let mut child = scopeguard::guard(child, |child| {
@@ -741,12 +746,12 @@ impl WalRedoProcess {
                         error!(error=?e, "failed to read from walredo stderr");
                     }
                 }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
         );
 
         Ok(Self {
             conf,
-            tenant_id,
+            tenant_shard_id,
             child: Some(child),
             stdin: Mutex::new(ProcessInput {
                 stdin,
@@ -772,7 +777,7 @@ impl WalRedoProcess {
     // Apply given WAL records ('records') over an old page image. Returns
     // new page image.
     //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
     fn apply_wal_records(
         &self,
         tag: BufferTag,
@@ -966,11 +971,7 @@ impl WalRedoProcess {
         // these files will be collected to an allure report
         let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
 
-        // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
-        let path = self
-            .conf
-            .tenant_path(&TenantShardId::unsharded(self.tenant_id))
-            .join(&filename);
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
 
         let res = std::fs::OpenOptions::new()
             .write(true)
@@ -1004,7 +1005,7 @@ impl Drop for WalRedoProcess {
 /// Wrapper type around `std::process::Child` which guarantees that the child
 /// will be killed and waited-for by this process before being dropped.
 struct NoLeakChild {
-    tenant_id: TenantId,
+    tenant_id: TenantShardId,
     child: Option<Child>,
 }
 
@@ -1023,7 +1024,7 @@ impl DerefMut for NoLeakChild {
 }
 
 impl NoLeakChild {
-    fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
+    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
         let child = command.spawn()?;
         Ok(NoLeakChild {
             tenant_id,
@@ -1078,7 +1079,7 @@ impl Drop for NoLeakChild {
             Some(child) => child,
             None => return,
         };
-        let tenant_id = self.tenant_id;
+        let tenant_shard_id = self.tenant_id;
         // Offload the kill+wait of the child process into the background.
         // If someone stops the runtime, we'll leak the child process.
         // We can ignore that case because we only stop the runtime on pageserver exit.
@@ -1086,7 +1087,11 @@ impl Drop for NoLeakChild {
             tokio::task::spawn_blocking(move || {
                 // Intentionally don't inherit the tracing context from whoever is dropping us.
                 // This thread here is going to outlive of our dropper.
-                let span = tracing::info_span!("walredo", %tenant_id);
+                let span = tracing::info_span!(
+                    "walredo",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug()
+                );
                 let _entered = span.enter();
                 Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
             })
@@ -1096,11 +1101,11 @@ impl Drop for NoLeakChild {
 }
 
 trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
 }
 
 impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
         NoLeakChild::spawn(tenant_id, self)
     }
 }
@@ -1155,6 +1160,7 @@ mod tests {
     use crate::repository::Key;
     use crate::{config::PageServerConf, walrecord::NeonWalRecord};
     use bytes::Bytes;
+    use pageserver_api::shard::TenantShardId;
     use std::str::FromStr;
     use utils::{id::TenantId, lsn::Lsn};
 
@@ -1264,9 +1270,9 @@ mod tests {
             let repo_dir = camino_tempfile::tempdir()?;
             let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
             let conf = Box::leak(Box::new(conf));
-            let tenant_id = TenantId::generate();
+            let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
-            let manager = PostgresRedoManager::new(conf, tenant_id);
+            let manager = PostgresRedoManager::new(conf, tenant_shard_id);
 
             Ok(RedoHarness {
                 _repo_dir: repo_dir,

From c119af8ddd019a5eebf37d28ce54c4424a1faab2 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 27 Dec 2023 16:02:09 +0000
Subject: [PATCH 201/202] pageserver: run at least 2 background task threads

Otherwise an assertion in CONCURRENT_BACKGROUND_TASKS will
trip if you try to run the pageserver on a single core.
---
 pageserver/src/task_mgr.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index cb1b2b8011..eabb1d0022 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
     // else, but that has not been needed in a long time.
     std::env::var("TOKIO_WORKER_THREADS")
         .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
+        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
 });
 
 #[derive(Debug, Clone, Copy)]

From 7662df6ca0c5c0977571cb1fcae3746e6af4fbae Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Wed, 3 Jan 2024 11:36:38 -0800
Subject: [PATCH 202/202] Fix minimum backoff to 1ms

---
 pgxn/neon/libpagestore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index e58c28d7d5..3a7c0f1bb6 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -35,7 +35,7 @@
 
 #define PageStoreTrace DEBUG5
 
-#define MIN_RECONNECT_INTERVAL_USEC 100
+#define MIN_RECONNECT_INTERVAL_USEC 1000
 #define MAX_RECONNECT_INTERVAL_USEC 1000000
 
 bool		connected = false;