From fad9be459883467310bdd08d2f336ad3ce9deb80 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Mar 2024 08:56:55 +0000 Subject: [PATCH 01/52] pageserver: mention key in walredo errors (#6988) ## Problem - Walredo errors, e.g. during image creation, mention the LSN affected but not the key. ## Summary of changes - Add key to "error applying ... WAL records" log message --- pageserver/src/walredo.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 35cbefb92c..0004f4f3c9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -262,7 +262,7 @@ impl PostgresRedoManager { // next request will launch a new one. if let Err(e) = result.as_ref() { error!( - "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), records.last().map(|p| p.0).unwrap_or(Lsn(0)), From 8dc7dc79dd493f81e78f2afd37c1fe8a1d79afaa Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Mar 2024 09:10:04 +0000 Subject: [PATCH 02/52] tests: debugging for `test_secondary_downloads` failures (#6984) ## Problem - #6966 - Existing logs aren't pointing to a cause: it looks like heatmap upload and download are happening, but for some reason the evicted layer isn't removed on the secondary location. ## Summary of changes - Assert evicted layer is gone from heatmap before checking its gone from local disk: this will give clarity on whether the issue is with the uploads or downloads. - On assertion failures, log the contents of heatmap. 
--- test_runner/fixtures/remote_storage.py | 10 +++++ .../regress/test_pageserver_secondary.py | 41 ++++++++++++++----- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4a692688e0..60591d8d46 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -252,6 +252,16 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenant_path(self, tenant_id: TenantId) -> str: + return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + + def heatmap_key(self, tenant_id: TenantId) -> str: + return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" + + def heatmap_content(self, tenant_id: TenantId): + r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) + return json.loads(r["Body"].read().decode("utf-8")) + RemoteStorage = Union[LocalFsStorage, S3Storage] diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8f694de2e1..8ba9d767dd 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,3 +1,4 @@ +import json import random from pathlib import Path from typing import Any, Dict, Optional @@ -10,7 +11,7 @@ from fixtures.pageserver.utils import ( poll_for_remote_storage_iterations, tenant_delete_wait_completed, ) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload @@ -436,6 +437,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) assert env.attachment_service is not None + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -491,18 +493,35 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Do evictions on attached pageserver, check secondary follows along # ================================================================== - log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + try: + log.info("Evicting a layer...") + layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] + some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1] + log.info(f"Victim layer: {layer_to_evict.name}") + ps_attached.http_client().evict_layer( + tenant_id, timeline_id, layer_name=layer_to_evict.name + ) - log.info("Synchronizing after eviction...") - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - ps_secondary.http_client().tenant_secondary_download(tenant_id) + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_layers = set( + layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] + ) + assert layer_to_evict.name not in heatmap_layers + assert some_other_layer.name in heatmap_layers - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == 
list_layers( - ps_secondary, tenant_id, timeline_id - ) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + except: + # On assertion failures, log some details to help with debugging + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}") + raise # Scrub the remote storage # ======================== From 3114be034a5845fa95ffe1e05f420eae9e84d031 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 4 Mar 2024 13:31:28 +0400 Subject: [PATCH 03/52] proxy: change is cold start to enum (#6948) ## Problem Actually it's good idea to distinguish between cases when it's a cold start, but we took the compute from the pool ## Summary of changes Updated to enum. --- proxy/src/console/messages.rs | 14 ++++++- proxy/src/context.rs | 8 ++-- proxy/src/context/parquet.rs | 75 ++++++++++++++++++----------------- 3 files changed, 55 insertions(+), 42 deletions(-) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 1f94059f1e..85adb31654 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,4 +1,4 @@ -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::fmt; use crate::auth::IpPattern; @@ -98,7 +98,16 @@ pub struct MetricsAuxInfo { pub endpoint_id: EndpointId, pub project_id: ProjectId, pub branch_id: BranchId, - pub is_cold_start: Option, + pub cold_start_info: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum ColdStartInfo { + Unknown = 0, + Warm = 1, + PoolHit = 2, + PoolMiss = 3, } #[cfg(test)] @@ -111,6 +120,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "cold_start_info": "unknown", }) } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index abad8a6412..1b48e01358 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -9,7 +9,7 @@ use tracing::{field::display, info_span, Span}; use uuid::Uuid; use crate::{ - console::messages::MetricsAuxInfo, + console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, BranchId, DbName, EndpointId, ProjectId, RoleName, @@ -42,7 +42,7 @@ pub struct RequestMonitoring { error_kind: Option, pub(crate) auth_method: Option, success: bool, - is_cold_start: Option, + cold_start_info: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. 
@@ -91,7 +91,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - is_cold_start: None, + cold_start_info: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), @@ -115,7 +115,7 @@ impl RequestMonitoring { self.set_endpoint_id(x.endpoint_id); self.branch = Some(x.branch_id); self.project = Some(x.project_id); - self.is_cold_start = x.is_cold_start; + self.cold_start_info = x.cold_start_info; } pub fn set_project_id(&mut self, project_id: ProjectId) { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 54f51604bf..1b1274b196 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -93,7 +93,7 @@ struct RequestData { /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. - is_cold_start: Option, + cold_start_info: Option, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, @@ -121,7 +121,10 @@ impl From for RequestData { region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, - is_cold_start: value.is_cold_start, + cold_start_info: value + .cold_start_info + .as_ref() + .map(|x| serde_json::to_string(x).unwrap_or_default()), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -455,7 +458,7 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), - is_cold_start: Some(true), + cold_start_info: Some("no".into()), duration_us: rng.gen_range(0..30_000_000), } } @@ -525,16 +528,16 @@ mod tests { assert_eq!( file_stats, [ - (1315032, 3, 6000), - (1315025, 3, 6000), - (1315085, 3, 6000), - (1315042, 3, 6000), - (1315172, 3, 6000), - (1315014, 3, 6000), - (1314806, 3, 6000), - (1315042, 3, 6000), - (438563, 1, 2000) - ], + (1314406, 3, 6000), + (1314399, 3, 6000), + (1314459, 3, 6000), + (1314416, 3, 6000), + (1314546, 3, 6000), + (1314388, 3, 6000), + (1314180, 3, 6000), + (1314416, 3, 6000), + (438359, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -563,12 +566,12 @@ mod tests { assert_eq!( file_stats, [ - (1220433, 5, 10000), - (1226583, 5, 10000), - (1228377, 5, 10000), - (1227739, 5, 10000), - (1219017, 5, 10000) - ], + (1220668, 5, 10000), + (1226818, 5, 10000), + (1228612, 5, 10000), + (1227974, 5, 10000), + (1219252, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -599,12 +602,12 @@ mod tests { assert_eq!( file_stats, [ - (1206080, 5, 10000), - (1205811, 5, 10000), - (1206104, 5, 10000), - (1206092, 5, 10000), - (1206347, 5, 10000) - ], + (1206315, 5, 10000), + (1206046, 5, 10000), + (1206339, 5, 10000), + (1206327, 5, 10000), + (1206582, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -628,16 +631,16 @@ mod tests { assert_eq!( file_stats, [ - (1315032, 3, 6000), - (1315025, 3, 6000), - (1315085, 3, 6000), - (1315042, 3, 6000), - (1315172, 3, 6000), - (1315014, 3, 6000), - (1314806, 3, 6000), - (1315042, 3, 6000), - (438563, 1, 2000) - ], + (1314406, 3, 6000), + (1314399, 3, 6000), + (1314459, 3, 6000), + (1314416, 3, 6000), + (1314546, 3, 6000), + (1314388, 3, 6000), + (1314180, 3, 6000), + (1314416, 3, 6000), + (438359, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -673,7 +676,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)], + [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] ); 
tmpdir.close().unwrap(); From 3fd77eb0d46dba7de3bd51ada2a7c46f56fd6f72 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 12:33:42 +0100 Subject: [PATCH 04/52] layer file creation: remove redundant fsync()s (#6983) The `writer.finish()` methods already fsync the inode, using `VirtualFile::sync_all()`. All that the callers need to do is fsync their directory, i.e., the timeline directory. Note that there's a call in the new compaction code that is apparently dead-at-runtime, so, I couldn't fix up any fsyncs there [Link](https://github.com/neondatabase/neon/blob/502b69b33bbd4ad1b0647e921a9c665249a2cd62/pageserver/src/tenant/timeline/compaction.rs#L204-L211). Note that layer durability still matters somewhat, even after #5198 which made remote storage authoritative. We do have the layer file length as an indicator, but no checksums on the layer file contents. So, a series of overwrites without fsyncs in the middle, plus a subsequent crash, could cause us to end up in a state where the file length matches but the contents are garbage. part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/tenant/timeline.rs | 63 ++++++------------------------- 1 file changed, 11 insertions(+), 52 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 206f20306e..0c03ef33c3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -10,7 +10,7 @@ mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; use futures::stream::StreamExt; @@ -3422,26 +3422,10 @@ impl Timeline { let _g = span.entered(); let new_delta = Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - let new_delta_path = new_delta.local_path().to_owned(); - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable. - // - // NB: timeline dir must be synced _after_ the file contents are durable. - // So, two separate fsyncs are required, they mustn't be batched. - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, the fsync overhead can be reduces as follows: - // 1. write them all to temporary file names - // 2. fsync them - // 3. rename to the final name - // 4. fsync the parent directory. - // Note that (1),(2),(3) today happen inside write_to_disk(). - // - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. par_fsync::par_fsync(&[self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) @@ -3674,25 +3658,10 @@ impl Timeline { } } - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. It would be better to create them all - // and fsync them all in parallel. 
- let all_paths = image_layers - .iter() - .map(|layer| layer.local_path().to_owned()) - .collect::>(); - - par_fsync::par_fsync_async(&all_paths) - .await - .context("fsync of newly created layer files")?; - - if !all_paths.is_empty() { + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + if !image_layers.is_empty() { par_fsync::par_fsync_async(&[self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) @@ -4279,22 +4248,12 @@ impl Timeline { } } - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - let layer_paths: Vec = new_layers - .iter() - .map(|l| l.local_path().to_owned()) - .collect(); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync_async(&layer_paths) - .await - .context("fsync all new layers")?; - + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. let timeline_dir = self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id); - par_fsync::par_fsync_async(&[timeline_dir]) .await .context("fsync of timeline dir")?; From 5c6d78d4692dcf1096cf95f759d89203f824bf07 Mon Sep 17 00:00:00 2001 From: Andreas Scherbaum Date: Mon, 4 Mar 2024 13:02:18 +0100 Subject: [PATCH 05/52] Rename "zenith" to "neon" (#6957) Usually RFC documents are not modified, but the vast mentions of "zenith" in early RFC documents make it desirable to update the product name to today's name, to avoid confusion. ## Problem Early RFC documents use the old "zenith" product name a lot, which is not something everyone is aware of after the product was renamed. ## Summary of changes Replace occurrences of "zenith" with "neon". Images are excluded. --------- Co-authored-by: Andreas Scherbaum --- docs/rfcs/002-storage.md | 2 +- docs/rfcs/003-laptop-cli.md | 122 +++++++++--------- docs/rfcs/004-durability.md | 2 +- docs/rfcs/005-zenith_local.md | 46 +++---- docs/rfcs/006-laptop-cli-v2-CLI.md | 48 +++---- .../006-laptop-cli-v2-repository-structure.md | 44 +++---- docs/rfcs/007-serverless-on-laptop.md | 26 ++-- docs/rfcs/008-push-pull.md | 12 +- docs/rfcs/009-snapshot-first-storage-cli.md | 20 +-- docs/rfcs/013-term-history.md | 2 +- docs/rfcs/014-safekeepers-gossip.md | 2 +- docs/rfcs/015-storage-messaging.md | 4 +- 12 files changed, 165 insertions(+), 165 deletions(-) diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index f99683cf09..d11b750e73 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -1,4 +1,4 @@ -# Zenith storage node — alternative +# Neon storage node — alternative ## **Design considerations** diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 1a549c2df5..003a05bd16 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -1,6 +1,6 @@ # Command line interface (end-user) -Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start. +Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. 
I would also suggest bundling our patched postgres inside neon distribution at least at the start. This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. @@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle # Possible usage scenarios -## Install zenith, run a postgres +## Install neon, run a postgres ``` -> brew install pg-zenith -> zenith pg create # creates pgdata with default pattern pgdata$i -> zenith pg list +> brew install pg-neon +> neon pg create # creates pgdata with default pattern pgdata$i +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 0G zenith-local localhost:5432 +primary1 pgdata1 0G neon-local localhost:5432 ``` -## Import standalone postgres to zenith +## Import standalone postgres to neon ``` -> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg [====================------------] 60% | 20MB/s -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - -> zenith pg create --snapshot oldpg +> neon pg create --snapshot oldpg Started postgres on localhost:5432 -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot destroy oldpg +> neon snapshot destroy oldpg Ok ``` Also, we may start snapshot import implicitly by looking at snapshot schema ``` -> zenith pg create --snapshot basebackup://replication@localhost:5432/ +> neon pg create --snapshot basebackup://replication@localhost:5432/ Downloading snapshot... Done. Started postgres on localhost:5432 Destroying snapshot... Done. @@ -52,39 +52,39 @@ Destroying snapshot... Done. Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). ``` -> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies ``` ## Create snapshot and push it to the cloud ``` -> zenith snapshot create pgdata1@snap1 -> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +> neon snapshot create pgdata1@snap1 +> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1 ``` ## Rollback database to the snapshot -One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. 
Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `neon pg checkout`. ``` -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot create pgdata1@snap1 +> neon snapshot create pgdata1@snap1 -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - pgdata1@CURRENT 6G - -> zenith pg checkout pgdata1@snap1 +> neon pg checkout pgdata1@snap1 Stopping postgres on pgdata1. Rolling back pgdata1@CURRENT to pgdata1@snap1. Starting postgres on pgdata1. -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - @@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). ``` -> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month ``` Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. @@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o ## storage -Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. +Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. -**zenith storage attach** -t [native|s3] -c key=value -n name +**neon storage attach** -t [native|s3] -c key=value -n name -Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'. -**zenith storage list** +**neon storage list** Show currently attached storages. For example: ``` -> zenith storage list +> neon storage list NAME USED TYPE OPTIONS PATH -local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr -zcloud 60G zenith-remote zenith.tech/stas/mystore +local 5.1G neon-local /opt/neon/store/local +local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr +zcloud 60G neon-remote neon.tech/stas/mystore s3tank 80G S3 ``` -**zenith storage detach** +**neon storage detach** -**zenith storage show** +**neon storage show** @@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c Pg is a term for a single postgres running on some data. 
I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. -**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata +**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. --no-start: just init datadir without creating ---snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1) +--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1) --cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) -**zenith pg destroy** +**neon pg destroy** -**zenith pg start** [--replica] pgdata +**neon pg start** [--replica] pgdata Start postgres with proper extensions preloaded/installed. -**zenith pg checkout** +**neon pg checkout** Rollback data directory to some previous snapshot. -**zenith pg stop** pg_id +**neon pg stop** pg_id -**zenith pg list** +**neon pg list** ``` ROLE PGDATA USED STORAGE ENDPOINT @@ -173,7 +173,7 @@ primary my_pg2 3.2G local.compr localhost:5435 - my_pg3 9.2G local.compr - ``` -**zenith pg show** +**neon pg show** ``` my_pg: @@ -194,7 +194,7 @@ my_pg: ``` -**zenith pg start-rest/graphql** pgdata +**neon pg start-rest/graphql** pgdata Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. @@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. -**zenith snapshot create** pgdata_name@snap_name +**neon snapshot create** pgdata_name@snap_name Creates a new snapshot in the same storage where pgdata_name exists. -**zenith snapshot push** --to url pgdata_name@snap_name +**neon snapshot push** --to url pgdata_name@snap_name -Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. +Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go. -**zenith snapshot recv** +**neon snapshot recv** Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. -**zenith snapshot pull** --from url or path +**neon snapshot pull** --from url or path -Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. 
+Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format. -**zenith snapshot import** --from basebackup://<...> or path +**neon snapshot import** --from basebackup://<...> or path Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. -**zenith snapshot export** +**neon snapshot export** -Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay). -**zenith snapshot diff** snap1 snap2 +**neon snapshot diff** snap1 snap2 Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. -**zenith snapshot destroy** +**neon snapshot destroy** ## pitr @@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream XXX: any suggestions on a better name? -**zenith pitr create** name +**neon pitr create** name --ttl = inf | period @@ -247,21 +247,21 @@ XXX: any suggestions on a better name? --storage = storage_name -**zenith pitr extract-snapshot** pitr_name --lsn xxx +**neon pitr extract-snapshot** pitr_name --lsn xxx Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) -**zenith pitr gc** pitr_name +**neon pitr gc** pitr_name Force garbage collection on some PITR area. -**zenith pitr list** +**neon pitr list** -**zenith pitr destroy** +**neon pitr destroy** ## console -**zenith console** +**neon console** Opens browser targeted at web console with the more or less same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md index d4716156d1..6b83c77403 100644 --- a/docs/rfcs/004-durability.md +++ b/docs/rfcs/004-durability.md @@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can acknowledge the commit to the client and be reasonably certain that we will not lose the transaction? -Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +Neon uses a group of WAL safekeeper nodes to hold the generated WAL. A WAL record is considered durable, when it has been written to a majority of WAL safekeeper nodes. In this document, I use 5 safekeepers, because I have five fingers. A WAL record is durable, diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index e36d0a9ae3..6c283d7a37 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -1,23 +1,23 @@ -# Zenith local +# Neon local -Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. +Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome. #### Why do we need it? - For distribution - this easy to use binary will help us to build adoption among developers. - For internal use - to test all components together. -In my understanding, we consider it to be just a mock-up version of zenith-cloud. 
+In my understanding, we consider it to be just a mock-up version of neon-cloud. > Question: How much should we care about durability and security issues for a local setup? #### Why is it better than a simple local postgres? -- Easy one-line setup. As simple as `cargo install zenith && zenith start` +- Easy one-line setup. As simple as `cargo install neon && neon start` - Quick and cheap creation of compute nodes over the same storage. > Question: How can we describe a use-case for this feature? -- Zenith-local can work with S3 directly. +- Neon-local can work with S3 directly. - Push and pull images (snapshots) to remote S3 to exchange data with other users. @@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. -CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli +- **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. +CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli -- **zenith-console** - WEB UI with same functionality as CLI. +- **neon-console** - WEB UI with same functionality as CLI. >Note: not for the first release. -- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. - > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. +- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local. -- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). > Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src -- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon. > Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? > Question: Do we use it together with local page store or they are interchangeable? WIP code is ??? -- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. > Question: How should it operate in a local setup? 
In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. -WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper +WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper -- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. +- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. - WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node #### REST API: Service endpoint: `http://localhost:3000` Resources: -- /storages - Where data lives: zenith-pageserver or zenith-s3 -- /pgs - Postgres - zenith-computenode +- /storages - Where data lives: neon-pageserver or neon-s3 +- /pgs - Postgres - neon-computenode - /snapshots - snapshots **TODO** ->Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? +>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? Methods and their mapping to CLI: -- /storages - zenith-pageserver or zenith-s3 +- /storages - neon-pageserver or neon-s3 CLI | REST API ------------- | ------------- @@ -84,7 +84,7 @@ storage list | GET /storages storage show -n name | GET /storages/:storage_name -- /pgs - zenith-computenode +- /pgs - neon-computenode CLI | REST API ------------- | ------------- diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index 84dc932211..5030ecc7e7 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -1,45 +1,45 @@ -Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". +Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". # CLI v2 (after chatting with Carl) -Zenith introduces the notion of a repository. +Neon introduces the notion of a repository. 
```bash -zenith init -zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +neon init +neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory ``` Once you have a cluster catalog you can explore it ```bash -zenith log -- returns a list of commits -zenith status -- returns if there are changes in the catalog that can be committed -zenith commit -- commits the changes and generates a new commit hash -zenith branch experimental -- creates a branch called testdb based on a given commit hash +neon log -- returns a list of commits +neon status -- returns if there are changes in the catalog that can be committed +neon commit -- commits the changes and generates a new commit hash +neon branch experimental -- creates a branch called testdb based on a given commit hash ``` To make changes in the catalog you need to run compute nodes ```bash -- here is how you a compute node -zenith start /home/pipedpiper/northwind:main -- starts a compute instance -zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +neon start /home/pipedpiper/northwind:main -- starts a compute instance +neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run --- zenith status and see how there are two WAL streams one on top of +-- neon status and see how there are two WAL streams one on top of -- the main branch -zenith status +neon status -- and another on top of the experimental branch -zenith status -b experimental +neon status -b experimental -- you can commit each branch separately -zenith commit main +neon commit main -- or -zenith commit -c /home/pipedpiper/northwind:experimental +neon commit -c /home/pipedpiper/northwind:experimental ``` Starting compute instances against cloud environments @@ -47,18 +47,18 @@ Starting compute instances against cloud environments ```bash -- you can start a compute instance against the cloud environment -- in this case all of the changes will be streamed into the cloud -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith status -c https://zenith:tech/pipedpiper/northwind:main -zenith commit -c https://zenith:tech/pipedpiper/northwind:main -zenith branch -c https://zenith:tech/pipedpiper/northwind: experimental +neon start https://neon:tecj/pipedpiper/northwind:main +neon start https://neon:tecj/pipedpiper/northwind:main +neon status -c https://neon:tecj/pipedpiper/northwind:main +neon commit -c https://neon:tecj/pipedpiper/northwind:main +neon branch -c https://neon:tecj/pipedpiper/northwind: experimental ``` Pushing data into the cloud ```bash -- pull all the commits from the cloud -zenith pull +neon pull -- push all the commits to the cloud -zenith push +neon push ``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md 
b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index e6e6e172ad..749a940313 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -1,14 +1,14 @@ # Repository format -A Zenith repository is similar to a traditional PostgreSQL backup +A Neon repository is similar to a traditional PostgreSQL backup archive, like a WAL-G bucket or pgbarman backup catalogue. It holds multiple versions of a PostgreSQL database cluster. -The distinguishing feature is that you can launch a Zenith Postgres +The distinguishing feature is that you can launch a Neon Postgres server directly against a branch in the repository, without having to -"restore" it first. Also, Zenith manages the storage automatically, +"restore" it first. Also, Neon manages the storage automatically, there is no separation between full and incremental backups nor WAL -archive. Zenith relies heavily on the WAL, and uses concepts similar +archive. Neon relies heavily on the WAL, and uses concepts similar to incremental backups and WAL archiving internally, but it is hidden from the user. @@ -19,15 +19,15 @@ efficient. Just something to get us started. The repository directory looks like this: - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history - .zenith/refs/branches/mybranch - .zenith/refs/tags/foo - .zenith/refs/tags/bar + .neon/refs/branches/mybranch + .neon/refs/tags/foo + .neon/refs/tags/bar - .zenith/datadirs/ + .neon/datadirs/ ### Timelines @@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node against a tag or arbitrary LSN on a timeline, but in order to write, you need to create a timeline. -Each timeline is stored in a directory under .zenith/timelines. It +Each timeline is stored in a directory under .neon/timelines. It consists of a WAL archive, containing all the WAL in the standard PostgreSQL format, under the wal/ subdirectory. @@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags). ### Datadirs -.zenith/datadirs contains PostgreSQL data directories. You can launch +.neon/datadirs contains PostgreSQL data directories. You can launch a Postgres instance on one of them with: ``` - postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c + postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c ``` All the actual data is kept in the timeline directories, under -.zenith/timelines. The data directories are only needed for active +.neon/timelines. The data directories are only needed for active PostgreQSL instances. After an instance is stopped, the data directory -can be safely removed. "zenith start" will recreate it quickly from -the data in .zenith/timelines, if it's missing. +can be safely removed. "neon start" will recreate it quickly from +the data in .neon/timelines, if it's missing. ## Version 2 @@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support: ### Garbage collection -When you run "zenith gc", old timelines that are no longer needed are +When you run "neon gc", old timelines that are no longer needed are removed. 
That involves collecting the list of "unreachable" objects, starting from the named branches and tags. Also, if enough WAL has been generated on a timeline since last snapshot, a new snapshot or delta is created. -### zenith push/pull +### neon push/pull Compare the tags and branches on both servers, and copy missing ones. For each branch, compare the timeline it points to in both servers. If @@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept that we have in the WAL safekeeper -### zenith checkout/commit +### neon checkout/commit In this format, there is no concept of a "working tree", and hence no concept of checking out or committing. All modifications are done on @@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree". You can later remove it and have it garbage collected, or to "commit", re-point the branch to the new timeline. -If we want to have a worktree and "zenith checkout/commit" concept, we can +If we want to have a worktree and "neon checkout/commit" concept, we can emulate that with a temporary timeline. Create the temporary timeline at -"zenith checkout", and have "zenith commit" modify the branch to point to +"neon checkout", and have "neon commit" modify the branch to point to the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md index e6355f4a03..96f117bfe9 100644 --- a/docs/rfcs/007-serverless-on-laptop.md +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -4,27 +4,27 @@ How it works now 1. Create repository, start page server on it ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create a branch, and start a Postgres instance on it ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 -$ zenith pg create heikki +$ neon pg create heikki Initializing Postgres on timeline 76cf9279915be7797095241638e64644... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432 -$ zenith pg start pg1 +$ neon pg start pg1 Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' waiting for server to start.... done server started @@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just: 1. Create repository, start page server on it (same as before) ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create branch ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 ``` diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md index 272628e1ce..a36932222a 100644 --- a/docs/rfcs/008-push-pull.md +++ b/docs/rfcs/008-push-pull.md @@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W The origin represents connection info for some remote pageserver. 
Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that). ``` -zenith origin add -zenith origin list -zenith origin remove +neon origin add +neon origin list +neon origin remove ``` Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. -Behind the scenes, this commands may update toml file inside .zenith directory. +Behind the scenes, this commands may update toml file inside .neon directory. ## Push ### Pushing branch ``` -zenith push mybranch cloudserver # push to eponymous branch in cloudserver -zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +neon push mybranch cloudserver # push to eponymous branch in cloudserver +neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver ``` Exact mechanics would be slightly different in the following situations: diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 0acbd68f86..bbd0f75fe2 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith. +Even if neon aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to neon. So here is an attempt to design consistent CLI for different usage scenarios: @@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config. Push snapshots to `storage_dest` in background. ``` -zenith init --storage_dest=S3_PREFIX -zenith start +neon init --storage_dest=S3_PREFIX +neon start ``` #### 2. Restart pageserver (manually or crash-recovery). @@ -25,7 +25,7 @@ Take `storage_dest` from pageserver config, start pageserver from latest snapsho Push snapshots to `storage_dest` in background. ``` -zenith start +neon start ``` #### 3. Import. @@ -35,22 +35,22 @@ Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time Save`storage_dest` parameters in config. Push snapshots to `storage_dest` in background. ``` -//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage. -zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX -zenith start +//I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage. +neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX +neon start ``` How to pass credentials needed for `snapshot_path`? #### 4. Export. Manually push snapshot to `snapshot_path` which differs from `storage_dest` -Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. +Optionally set `snapshot_format`, which can be plain pgdata format or neon format. 
``` -zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata +neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata ``` #### Notes and questions - safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI? -- Why do we need `zenith init` as a separate command? Can't we init everything at first start? +- Why do we need `neon init` as a separate command? Can't we init everything at first start? - We can think of better names for all options. - Export to plain postgres format will be useless, if we are not 100% compatible on page level. I can recall at least one such difference - PD_WAL_LOGGED flag in pages. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 7e815abf73..2f3ccbc09b 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. This roughly corresponds to proposed in -https://github.com/zenithdb/rfcs/pull/3/files +https://github.com/neondatabase/rfcs/pull/3/files This makes our biggest our difference from Raft. In Raft, every log record is diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md index 3d6cc04b94..ff38a0a0ef 100644 --- a/docs/rfcs/014-safekeepers-gossip.md +++ b/docs/rfcs/014-safekeepers-gossip.md @@ -1,6 +1,6 @@ # Safekeeper gossip -Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13) +Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13) ## Motivation diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index a415b90459..7702311d65 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -2,7 +2,7 @@ Created on 19.01.22 -Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich. +Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich. That it is an alternative to (014-safekeeper-gossip)[] @@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation: 1. We don't need persistency and strong consistency guarantees for the data we store in the etcd 2. etcd uses Grpc as a protocol, and messages are pretty simple -So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). +So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local neon installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). From 6e46204712a68e34b40caaa9cf01c7f4141ab0a1 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 4 Mar 2024 12:08:44 +0000 Subject: [PATCH 06/52] CI(deploy): use separate workflow for proxy deploys (#6995) ## Problem The current implementation of `deploy-prod` workflow doesn't allow to run parallel deploys on Storage and Proxy. ## Summary of changes - Call `deploy-proxy-prod` workflow that deploys only Proxy components, and that can be run in parallel with `deploy-prod` for Storage. 
--- .github/workflows/build_and_test.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2e52e7c28f..276c71c6e0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1132,11 +1132,9 @@ jobs: -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ -f deployProxy=true \ - -f deployStorage=false \ - -f deployStorageBroker=false \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} else From c861d71eeb6d3acfc4c99ced41dd0df778cda802 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 13:18:22 +0100 Subject: [PATCH 07/52] layer file creation: fatal_err on timeline dir fsync (#6985) As pointed out in the comments added in this PR: the in-memory state of the filesystem already has the layer file in its final place. If the fsync fails, but pageserver continues to execute, it's quite easy for subsequent pageserver code to observe the file being there and assume it's durable, when it really isn't. It can happen that we get ENOSPC during the fsync. However, 1. the timeline dir is small (remember, the big layer _file_ has already been synced). Small data means ENOSPC due to delayed allocation races etc are less likely. 2. what else are we going to do in that case? If we decide to bubble up the error, the file remains on disk. We could try to unlink it and fsync after the unlink. If that fails, we would _definitely_ need to error out. Is it worth the trouble though? Side note: all this logic about not carrying on after fsync failure implies that we `sync` the filesystem successfully before we restart the pageserver. We don't do that right now, but should (=> https://github.com/neondatabase/neon/issues/6989) part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/tenant/timeline.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0c03ef33c3..0a2ae5d8bd 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -50,7 +50,6 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::sync::gate::{Gate, GateGuard}; -use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -75,6 +74,10 @@ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + virtual_file::MaybeFatalIo, +}; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; @@ -3426,10 +3429,14 @@ impl Timeline { // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. 
+ // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. par_fsync::par_fsync(&[self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); anyhow::Ok(new_delta) } @@ -3662,11 +3669,14 @@ impl Timeline { // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. if !image_layers.is_empty() { + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. par_fsync::par_fsync_async(&[self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) .await - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); } let mut guard = self.layers.write().await; @@ -4251,12 +4261,16 @@ impl Timeline { // The writer.finish() above already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. + // + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. let timeline_dir = self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id); par_fsync::par_fsync_async(&[timeline_dir]) .await - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); } stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); From e1c032fb3ccabf61f5d41301cedbbb11a3d303a6 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 4 Mar 2024 17:26:16 +0400 Subject: [PATCH 08/52] Fix type (#6998) ## Problem Typo ## Summary of changes Fix --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 80a718d61a..b2c9a19588 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -97,7 +97,7 @@ jobs: **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Proxy release ${RELEASE_DATE}}" \ + gh pr create --title "Proxy release ${RELEASE_DATE}" \ --body-file "body.md" \ --head "${RELEASE_BRANCH}" \ --base "release-proxy" From 944cac950d9a151d7408f544952c4fdabb9cc9dd Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 14:31:09 +0100 Subject: [PATCH 09/52] layer file creation: fsync timeline directories using `VirtualFile::sync_all()` (#6986) Except for the involvement of the VirtualFile fd cache, this is equivalent to what happened before at runtime. Future PR https://github.com/neondatabase/neon/pull/6378 will implement `VirtualFile::sync_all()` using tokio-epoll-uring if that's configured as the io engine. This PR is preliminary work for that. 
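For illustration, a minimal standalone sketch of the durability pattern these timeline-dir fsyncs are about, using plain `tokio::fs` instead of the pageserver's `VirtualFile` (the helper name `create_file_durably` is made up for this sketch):

```
// Sketch only: after a new file's contents are fsynced, the *directory entry*
// that links it is still not durable until the containing directory is fsynced.
// The patch performs that directory fsync via VirtualFile::sync_all(); this
// approximation uses tokio::fs to show the same two-step sequence.
use std::io;
use std::path::Path;
use tokio::io::AsyncWriteExt;

async fn create_file_durably(dir: &Path, name: &str, data: &[u8]) -> io::Result<()> {
    let path = dir.join(name);

    // 1. Write the file and flush its inode to disk.
    let mut file = tokio::fs::File::create(&path).await?;
    file.write_all(data).await?;
    file.sync_all().await?;

    // 2. Fsync the directory so the new link survives a crash as well
    //    (directory handles can be opened and fsynced on Linux).
    let dir_handle = tokio::fs::File::open(dir).await?;
    dir_handle.sync_all().await?;
    Ok(())
}
```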
part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/tenant.rs | 1 - pageserver/src/tenant/par_fsync.rs | 84 ------------------------------ pageserver/src/tenant/timeline.rs | 79 ++++++++++++++++------------ 3 files changed, 46 insertions(+), 118 deletions(-) delete mode 100644 pageserver/src/tenant/par_fsync.rs diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4158133111..3423b50eaa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -151,7 +151,6 @@ pub(crate) mod ephemeral_file; pub mod layer_map; pub mod metadata; -mod par_fsync; pub mod remote_timeline_client; pub mod storage_layer; diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs deleted file mode 100644 index 3acb0fb431..0000000000 --- a/pageserver/src/tenant/par_fsync.rs +++ /dev/null @@ -1,84 +0,0 @@ -use std::{ - io, - sync::atomic::{AtomicUsize, Ordering}, -}; - -use camino::{Utf8Path, Utf8PathBuf}; - -fn fsync_path(path: &Utf8Path) -> io::Result<()> { - // TODO use VirtualFile::fsync_all once we fully go async. - let file = std::fs::File::open(path)?; - file.sync_all() -} - -fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> { - while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) { - fsync_path(path)?; - } - - Ok(()) -} - -fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> { - // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything. - - /// Use at most this number of threads. - /// Increasing this limit will - /// - use more memory - /// - increase the cost of spawn/join latency - const MAX_NUM_THREADS: usize = 64; - let num_threads = paths.len().min(MAX_NUM_THREADS); - let next_path_idx = AtomicUsize::new(0); - - std::thread::scope(|s| -> io::Result<()> { - let mut handles = vec![]; - // Spawn `num_threads - 1`, as the current thread is also a worker. - for _ in 1..num_threads { - handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx))); - } - - parallel_worker(paths, &next_path_idx)?; - - for handle in handles { - handle.join().unwrap()?; - } - - Ok(()) - }) -} - -/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool. -pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> { - if paths.len() == 1 { - fsync_path(&paths[0])?; - return Ok(()); - } - - fsync_in_thread_pool(paths) -} - -/// Parallel fsync asynchronously. 
-pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> { - const MAX_CONCURRENT_FSYNC: usize = 64; - let mut next = paths.iter().peekable(); - let mut js = tokio::task::JoinSet::new(); - loop { - while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() { - let next = next.next().expect("just peeked"); - let next = next.to_owned(); - js.spawn_blocking(move || fsync_path(&next)); - } - - // now the joinset has been filled up, wait for next to complete - if let Some(res) = js.join_next().await { - res??; - } else { - // last item had already completed - assert!( - next.peek().is_none(), - "joinset emptied, we shouldn't have more work" - ); - return Ok(()); - } - } -} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0a2ae5d8bd..64c324a5c8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -54,7 +54,6 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - par_fsync, }; use crate::{ context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder}, @@ -76,7 +75,7 @@ use crate::{ use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, - virtual_file::MaybeFatalIo, + virtual_file::{MaybeFatalIo, VirtualFile}, }; use crate::config::PageServerConf; @@ -3417,28 +3416,31 @@ impl Timeline { let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); move || { - // Write it out - // Keep this inside `spawn_blocking` and `Handle::current` - // as long as the write path is still sync and the read impl - // is still not fully async. Otherwise executor threads would - // be blocked. - let _g = span.entered(); - let new_delta = - Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - - // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. - // - // We use fatal_err() below because the after write_to_disk returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. - par_fsync::par_fsync(&[self_clone - .conf - .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .fatal_err("fsync of timeline dir"); - - anyhow::Ok(new_delta) + Handle::current().block_on( + async move { + let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. 
+ let timeline_dir = + VirtualFile::open(&self_clone.conf.timeline_path( + &self_clone.tenant_shard_id, + &self_clone.timeline_id, + )) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + anyhow::Ok(new_delta) + } + .instrument(span), + ) } }) .await @@ -3672,11 +3674,17 @@ impl Timeline { // We use fatal_err() below because the after writer.finish() returns with success, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. - par_fsync::par_fsync_async(&[self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) .await - .fatal_err("fsync of timeline dir"); + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); } let mut guard = self.layers.write().await; @@ -4265,12 +4273,17 @@ impl Timeline { // We use fatal_err() below because the after writer.finish() returns with success, // the in-memory state of the filesystem already has the layer file in its final place, // and subsequent pageserver code could think it's durable while it really isn't. - let timeline_dir = self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id); - par_fsync::par_fsync_async(&[timeline_dir]) + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() .await - .fatal_err("fsync of timeline dir"); + .fatal_err("VirtualFile::sync_all timeline dir"); } stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); From e938bb815763d1980540c8fa84781e160688d44a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 4 Mar 2024 09:17:14 -0500 Subject: [PATCH 10/52] fix epic issue template (#6920) The template does not parse on GitHub --- .github/ISSUE_TEMPLATE/epic-template.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 019e6e7345..c442f50fde 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -16,9 +16,9 @@ assignees: '' ## Implementation ideas - +## Tasks ```[tasklist] -### Tasks +- [ ] Example Task ``` From f0be9400f25cfbad356f5417e199325d2c12f7df Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 15:47:13 +0100 Subject: [PATCH 11/52] fix(test_remote_storage_upload_queue_retries): became flakier since #6960 (#6999) This PR increases the `wait_until` timeout. These are where things became more flaky as of https://github.com/neondatabase/neon/pull/6960. Most likely because it doubles the work in the `churn_while_failpoints_active_thread`. 
Slack context: https://neondb.slack.com/archives/C033RQ5SPDH/p1709554455962959?thread_ts=1709286362.850549&cid=C033RQ5SPDH --- test_runner/regress/test_remote_storage.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f8a0bef954..06c13cc07d 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -329,14 +329,15 @@ def test_remote_storage_upload_queue_retries( churn_while_failpoints_active_thread.start() # wait for churn thread's data to get stuck in the upload queue - wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) - wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + # Exponential back-off in upload queue, so, gracious timeouts. + + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") - # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts. wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) From 0d2395fe96dfadaea3b026990b5a77aa4a72c0e4 Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Mon, 4 Mar 2024 18:02:10 +0200 Subject: [PATCH 12/52] Update postgres-exporter to v0.12.1 (#7004) Fixes https://github.com/neondatabase/neon/issues/6996 Thanks to @bayandin --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 4520a5fc9c..a04dac6336 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -176,7 +176,7 @@ build: | # actually build the thing... && make install - FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter + FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter FROM burningalchemist/sql_exporter:0.13 AS sql-exporter From 191d8ac7e044e867b07f5007b783d00d0a87be45 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 4 Mar 2024 16:04:12 +0000 Subject: [PATCH 13/52] vm-image: update pgbouncer from 1.22.0 to 1.22.1 (#7005) pgbouncer 1.22.1 has been released > This release fixes issues caused by some clients using COPY FROM STDIN queries. Such queries could introduce memory leaks, performance regressions and prepared statement misbehavior. 
- NEWS: https://www.pgbouncer.org/2024/03/pgbouncer-1-22-1 - CHANGES: https://github.com/pgbouncer/pgbouncer/compare/pgbouncer_1_22_0...pgbouncer_1_22_1 ## Summary of changes - vm-image: update pgbouncer from 1.22.0 to 1.22.1 --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index a04dac6336..c1b7ad533a 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -193,7 +193,7 @@ build: | pkg-config # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_22_0 + ENV PGBOUNCER_TAG pgbouncer_1_22_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ From e62baa97041e10ce45772b3724e24e679a650d69 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 18:36:29 +0100 Subject: [PATCH 14/52] upgrade tokio 1.34 => 1.36 (#7008) tokio 1.36 has been out for a month. Release notes don't indicate major changes. Skimming through their issue tracker, I can't find open `C-bug` issues that would affect us. (My personal motivation for this is `JoinSet::try_join_next`.) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c23162971e..f937f3a372 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5810,9 +5810,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.34.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", From 3dfae4be8d5aba629e42ba4ae69017e4b4979350 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 4 Mar 2024 19:16:07 +0000 Subject: [PATCH 15/52] upgrade mio 0.8.10 => 0.8.11 (#7009) ## Problem `cargo deny` fails - https://rustsec.org/advisories/RUSTSEC-2024-0019 - https://github.com/tokio-rs/mio/security/advisories/GHSA-r8w9-5wcg-vfj7 > The vulnerability is Windows-specific, and can only happen if you are using named pipes. Other IO resources are not affected. ## Summary of changes - Upgrade `mio` from 0.8.10 to 0.8.11 (`cargo update -p mio`) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f937f3a372..864e5c9046 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2959,9 +2959,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", From b7db912be6296bb2569a1162892b6d047702afbf Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 4 Mar 2024 14:28:45 -0500 Subject: [PATCH 16/52] compute_ctl: only try zenith_admin if could not authenticate (#6955) ## Problem Fix https://github.com/neondatabase/neon/issues/6498 ## Summary of changes Only re-authenticate with zenith_admin if authentication fails. Otherwise, directly return the error message. 
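In other words, the fallback is now gated on the SQLSTATE of the connection error. A condensed sketch of that control flow (the function name and the pre-built fallback connection string are simplifications for this sketch; the real code rewrites the username on the original connstr, repairs the `cloud_admin` role via `zenith_admin`, and then reconnects with the original credentials):

```
use postgres::error::SqlState;
use postgres::{Client, NoTls};

// Sketch: retry with the fallback role only for authentication failures;
// any other error (network, shutdown, ...) is returned to the caller as-is.
fn connect_with_fallback(connstr: &str, fallback_connstr: &str) -> anyhow::Result<Client> {
    match Client::connect(connstr, NoTls) {
        Ok(client) => Ok(client),
        Err(e) => match e.code() {
            Some(&SqlState::INVALID_PASSWORD)
            | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
                // Authentication problem: worth one retry with the fallback role.
                Ok(Client::connect(fallback_connstr, NoTls)?)
            }
            // Not an auth failure: don't mask it behind a retry.
            _ => Err(e.into()),
        },
    }
}
```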
--------- Signed-off-by: Alex Chi Z --- compute_tools/src/compute.rs | 44 +++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a82b999cfb..da271e49cd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -17,6 +17,7 @@ use chrono::{DateTime, Utc}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; +use postgres::error::SqlState; use postgres::{Client, NoTls}; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -774,27 +775,34 @@ impl ComputeNode { // but we can create a new one and grant it all privileges. let connstr = self.connstr.clone(); let mut client = match Client::connect(connstr.as_str(), NoTls) { - Err(e) => { - info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", - e - ); - let mut zenith_admin_connstr = connstr.clone(); + Err(e) => match e.code() { + Some(&SqlState::INVALID_PASSWORD) + | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { + // connect with zenith_admin if cloud_admin could not authenticate + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let mut zenith_admin_connstr = connstr.clone(); - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; - let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; - // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; - drop(client); + let mut client = + Client::connect(zenith_admin_connstr.as_str(), NoTls) + .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; + // Disable forwarding so that users don't get a cloud_admin role + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + drop(client); - // reconnect with connstring with expected name - Client::connect(connstr.as_str(), NoTls)? - } + // reconnect with connstring with expected name + Client::connect(connstr.as_str(), NoTls)? + } + _ => return Err(e.into()), + }, Ok(client) => client, }; From 3da410c8fee05b0cd65a5c0b83fffa3d5680cd77 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 5 Mar 2024 10:03:54 +0100 Subject: [PATCH 17/52] tokio-epoll-uring: use it on the layer-creating code paths (#6378) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit part of #6663 See that epic for more context & related commits. Problem ------- Before this PR, the layer-file-creating code paths were using VirtualFile, but under the hood these were still blocking system calls. Generally this meant we'd stall the executor thread, unless the caller "knew" and used the following pattern instead: ``` spawn_blocking(|| { Handle::block_on(async { VirtualFile::....().await; }) }).await ``` Solution -------- This PR adopts `tokio-epoll-uring` on the layer-file-creating code paths in pageserver. 
Note that on-demand downloads still use `tokio::fs`, these will be converted in a future PR. Design: Avoiding Regressions With `std-fs` ------------------------------------------ If we make the VirtualFile write path truly async using `tokio-epoll-uring`, should we then remove the `spawn_blocking` + `Handle::block_on` usage upstack in the same commit? No, because if we’re still using the `std-fs` io engine, we’d then block the executor in those places where previously we were protecting us from that through the `spawn_blocking` . So, if we want to see benefits from `tokio-epoll-uring` on the write path while also preserving the ability to switch between `tokio-epoll-uring` and `std-fs` , where `std-fs` will behave identical to what we have now, we need to ***conditionally* use `spawn_blocking + Handle::block_on`** . I.e., in the places where we use that know, we’ll need to make that conditional based on the currently configured io engine. It boils down to investigating all the places where we do `spawn_blocking(... block_on(... VirtualFile::...))`. Detailed [write-up of that investigation in Notion](https://neondatabase.notion.site/Surveying-VirtualFile-write-path-usage-wrt-tokio-epoll-uring-integration-spawn_blocking-Handle-bl-5dc2270dbb764db7b2e60803f375e015?pvs=4 ), made publicly accessible. tl;dr: Preceding PRs addressed the relevant call sites: - `metadata` file: turns out we could simply remove it (#6777, #6769, #6775) - `create_delta_layer()`: made sensitive to `virtual_file_io_engine` in #6986 NB: once we are switched over to `tokio-epoll-uring` everywhere in production, we can deprecate `std-fs`; to keep macOS support, we can use `tokio::fs` instead. That will remove this whole headache. Code Changes In This PR ----------------------- - VirtualFile API changes - `VirtualFile::write_at` - implement an `ioengine` operation and switch `VirtualFile::write_at` to it - `VirtualFile::metadata()` - curiously, we only use it from the layer writers' `finish()` methods - introduce a wrapper `Metadata` enum because `std::fs::Metadata` cannot be constructed by code outside rust std - `VirtualFile::sync_all()` and for completeness sake, add `VirtualFile::sync_data()` Testing & Rollout ----------------- Before merging this PR, we ran the CI with both io engines. Additionally, the changes will soak in staging. We could have a feature gate / add a new io engine `tokio-epoll-uring-write-path` to do a gradual rollout. However, that's not part of this PR. Future Work ----------- There's still some use of `std::fs` and/or `tokio::fs` for directory namespace operations, e.g. `std::fs::rename`. We're not addressing those in this PR, as we'll need to add the support in tokio-epoll-uring first. Note that rename itself is usually fast if the directory is in the kernel dentry cache, and only the fsync after rename is slow. These fsyncs are using tokio-epoll-uring, so, the impact should be small. 
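To make the conditional `spawn_blocking + Handle::block_on` concrete, here is a reduced sketch of the dispatch that the layer-creating call sites now perform (the `IoEngine` enum mirrors the pageserver's, while `run_layer_io` and its `work` future are placeholders invented for this sketch):

```
use std::future::Future;

#[derive(Clone, Copy)]
enum IoEngine {
    StdFs,
    TokioEpollUring,
}

// Sketch: with std-fs the future still performs blocking syscalls internally,
// so keep the pre-existing protection and run it on a blocking thread; with
// tokio-epoll-uring the future is truly async and can be awaited inline.
async fn run_layer_io<F, R>(engine: IoEngine, work: F) -> R
where
    F: Future<Output = R> + Send + 'static,
    R: Send + 'static,
{
    match engine {
        IoEngine::StdFs => tokio::task::spawn_blocking(move || {
            tokio::runtime::Handle::current().block_on(work)
        })
        .await
        .expect("blocking task panicked"),
        IoEngine::TokioEpollUring => work.await,
    }
}
```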
--- pageserver/src/tenant/blob_io.rs | 14 ++- pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/timeline.rs | 78 +++++++------- pageserver/src/virtual_file.rs | 105 ++++++++++++++----- pageserver/src/virtual_file/io_engine.rs | 96 +++++++++++++++-- pageserver/src/virtual_file/metadata.rs | 30 ++++++ 6 files changed, 246 insertions(+), 78 deletions(-) create mode 100644 pageserver/src/virtual_file/metadata.rs diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index ec70bdc679..0d33100ead 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -12,7 +12,7 @@ //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! use bytes::{BufMut, BytesMut}; -use tokio_epoll_uring::{BoundedBuf, Slice}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; @@ -127,7 +127,7 @@ impl BlobWriter { /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. #[inline(always)] - async fn write_all_unbuffered( + async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, ) -> (B::Buf, Result<(), Error>) { @@ -162,7 +162,10 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + src_buf: B, + ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); return self.write_all_unbuffered(src_buf).await; @@ -210,7 +213,10 @@ impl BlobWriter { /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob(&mut self, srcbuf: B) -> (B::Buf, Result) { + pub async fn write_blob, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ) -> (B::Buf, Result) { let offset = self.offset; let len = srcbuf.bytes_init(); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 247dd1a8e4..e14a2f22cf 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -195,6 +195,7 @@ impl Layer { let downloaded = resident.expect("just initialized"); // if the rename works, the path is as expected + // TODO: sync system call std::fs::rename(temp_path, owner.local_path()) .with_context(|| format!("rename temporary file as correct path for {owner}"))?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 64c324a5c8..1f811155f6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3410,44 +3410,48 @@ impl Timeline { frozen_layer: &Arc, ctx: &RequestContext, ) -> anyhow::Result { - let span = tracing::info_span!("blocking"); - let new_delta: ResidentLayer = tokio::task::spawn_blocking({ - let self_clone = Arc::clone(self); - let frozen_layer = Arc::clone(frozen_layer); - let ctx = ctx.attached_child(); - move || { - Handle::current().block_on( - async move { - let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; - // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. 
- // - // We use fatal_err() below because the after write_to_disk returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. - let timeline_dir = - VirtualFile::open(&self_clone.conf.timeline_path( - &self_clone.tenant_shard_id, - &self_clone.timeline_id, - )) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); - anyhow::Ok(new_delta) - } - .instrument(span), - ) + let self_clone = Arc::clone(self); + let frozen_layer = Arc::clone(frozen_layer); + let ctx = ctx.attached_child(); + let work = async move { + let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self_clone + .conf + .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + anyhow::Ok(new_delta) + }; + // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. + // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. + use crate::virtual_file::io_engine::IoEngine; + match crate::virtual_file::io_engine::get() { + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => { + let span = tracing::info_span!("blocking"); + tokio::task::spawn_blocking({ + move || Handle::current().block_on(work.instrument(span)) + }) + .await + .context("spawn_blocking") + .and_then(|x| x) } - }) - .await - .context("spawn_blocking") - .and_then(|x| x)?; - - Ok(new_delta) + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => work.await, + } } async fn repartition( diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b7112108f2..6d4774cf75 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -17,20 +17,21 @@ use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use std::fs::{self, File}; +use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; +mod metadata; mod open_options; pub(crate) use io_engine::IoEngineKind; +pub(crate) use metadata::Metadata; pub(crate) use open_options::*; /// @@ -435,13 +436,25 @@ impl VirtualFile { /// Call File::sync_all() on the underlying File. 
pub async fn sync_all(&self) -> Result<(), Error> { - with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard - .with_std_file(|std_file| std_file.sync_all())) + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_all(file_guard).await; + res + }) } - pub async fn metadata(&self) -> Result { - with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard - .with_std_file(|std_file| std_file.metadata())) + /// Call File::sync_data() on the underlying File. + pub async fn sync_data(&self) -> Result<(), Error> { + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_data(file_guard).await; + res + }) + } + + pub async fn metadata(&self) -> Result { + with_file!(self, StorageIoOperation::Metadata, |file_guard| { + let (_file_guard, res) = io_engine::get().metadata(file_guard).await; + res + }) } /// Helper function internal to `VirtualFile` that looks up the underlying File, @@ -579,7 +592,7 @@ impl VirtualFile { } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at( + pub async fn write_all_at, Buf: IoBuf + Send>( &self, buf: B, mut offset: u64, @@ -590,8 +603,9 @@ impl VirtualFile { } let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { - // TODO: push `buf` further down - match self.write_at(&buf, offset).await { + let res; + (buf, res) = self.write_at(buf, offset).await; + match res { Ok(0) => { return ( Slice::into_inner(buf), @@ -605,7 +619,7 @@ impl VirtualFile { buf = buf.slice(n..); offset += n as u64; } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return (Slice::into_inner(buf), Err(e)), } } @@ -616,15 +630,19 @@ impl VirtualFile { /// Returns the IoBuf that is underlying the BoundedBuf `buf`. /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. 
- pub async fn write_all(&mut self, buf: B) -> (B::Buf, Result) { + pub async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { return (Slice::into_inner(buf.slice_full()), Ok(0)); } let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { - // TODO: push `Slice` further down - match self.write(&buf).await { + let res; + (buf, res) = self.write(buf).await; + match res { Ok(0) => { return ( Slice::into_inner(buf), @@ -644,11 +662,18 @@ impl VirtualFile { (Slice::into_inner(buf), Ok(nbytes)) } - async fn write(&mut self, buf: &[u8]) -> Result { + async fn write( + &mut self, + buf: Slice, + ) -> (Slice, Result) { let pos = self.pos; - let n = self.write_at(buf, pos).await?; + let (buf, res) = self.write_at(buf, pos).await; + let n = match res { + Ok(n) => n, + Err(e) => return (buf, Err(e)), + }; self.pos += n as u64; - Ok(n) + (buf, Ok(n)) } pub(crate) async fn read_at(&self, buf: B, offset: u64) -> (B, Result) @@ -676,16 +701,30 @@ impl VirtualFile { }) } - async fn write_at(&self, buf: &[u8], offset: u64) -> Result { - let result = with_file!(self, StorageIoOperation::Write, |file_guard| { - file_guard.with_std_file(|std_file| std_file.write_at(buf, offset)) - }); - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id]) - .add(size as i64); - } - result + async fn write_at( + &self, + buf: Slice, + offset: u64, + ) -> (Slice, Result) { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + observe_duration!(StorageIoOperation::Write, { + let ((_file_guard, buf), result) = + io_engine::get().write_at(file_guard, offset, buf).await; + if let Ok(size) = result { + STORAGE_IO_SIZE + .with_label_values(&[ + "write", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, result) + }) } } @@ -1083,6 +1122,7 @@ mod tests { use rand::Rng; use std::future::Future; use std::io::Write; + use std::os::unix::fs::FileExt; use std::sync::Arc; enum MaybeVirtualFile { @@ -1103,7 +1143,11 @@ mod tests { MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } - async fn write_all_at(&self, buf: B, offset: u64) -> Result<(), Error> { + async fn write_all_at, Buf: IoBuf + Send>( + &self, + buf: B, + offset: u64, + ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { let (_buf, res) = file.write_all_at(buf, offset).await; @@ -1124,7 +1168,10 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all(&mut self, buf: B) -> Result<(), Error> { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { let (_buf, res) = file.write_all(buf).await; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 892affa326..1a8cd9f562 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -7,6 +7,8 @@ //! //! Then use [`get`] and [`super::OpenOptions`]. +use tokio_epoll_uring::{IoBuf, Slice}; + pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] #[repr(u8)] @@ -61,7 +63,8 @@ pub(super) fn init(engine_kind: IoEngineKind) { set(engine_kind); } -pub(super) fn get() -> IoEngine { +/// Longer-term, this API should only be used by [`super::VirtualFile`]. 
+pub(crate) fn get() -> IoEngine { let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap(); if cfg!(test) { let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; @@ -98,7 +101,17 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::FileGuard; +use super::{FileGuard, Metadata}; + +#[cfg(target_os = "linux")] +fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { + match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + } +} impl IoEngine { pub(super) async fn read_at( @@ -133,16 +146,83 @@ impl IoEngine { IoEngine::TokioEpollUring => { let system = tokio_epoll_uring::thread_local_system().await; let (resources, res) = system.read(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_all()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.fsync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_data( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_data()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.fdatasync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn metadata( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = + file_guard.with_std_file(|std_file| std_file.metadata().map(Metadata::from)); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.statx(file_guard).await; ( resources, - res.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } - }), + res.map_err(epoll_uring_error_to_std).map(Metadata::from), ) } } } + pub(super) async fn write_at( + &self, + file_guard: FileGuard, + offset: u64, + buf: Slice, + ) -> ((FileGuard, Slice), std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset)); + ((file_guard, buf), result) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring::thread_local_system().await; + let (resources, res) = system.write(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } } diff --git a/pageserver/src/virtual_file/metadata.rs b/pageserver/src/virtual_file/metadata.rs new file mode 100644 index 0000000000..f530c50988 --- /dev/null +++ b/pageserver/src/virtual_file/metadata.rs @@ 
-0,0 +1,30 @@ +use std::fs; + +pub enum Metadata { + StdFs(fs::Metadata), + #[cfg(target_os = "linux")] + TokioEpollUring(Box), +} + +#[cfg(target_os = "linux")] +impl From> for Metadata { + fn from(value: Box) -> Self { + Metadata::TokioEpollUring(value) + } +} + +impl From for Metadata { + fn from(value: std::fs::Metadata) -> Self { + Metadata::StdFs(value) + } +} + +impl Metadata { + pub fn len(&self) -> u64 { + match self { + Metadata::StdFs(metadata) => metadata.len(), + #[cfg(target_os = "linux")] + Metadata::TokioEpollUring(statx) => statx.stx_size, + } + } +} From 752bf5a22f8b53a163102820d845c87bf848cb55 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 5 Mar 2024 12:14:37 +0200 Subject: [PATCH 18/52] build: clippy disallow futures::pin_mut macro (#7016) `std` has had `pin!` macro for some time, there is no need for us to use the older alternatives. Cannot disallow `tokio::pin` because tokio macros use that. --- clippy.toml | 7 +++++++ control_plane/src/pageserver.rs | 2 +- libs/postgres_backend/src/lib.rs | 4 +--- proxy/src/serverless/sql_over_http.rs | 4 +--- s3_scrubber/src/checks.rs | 5 ++--- s3_scrubber/src/garbage.rs | 14 +++++++------- s3_scrubber/src/scan_metadata.rs | 5 ++--- safekeeper/src/wal_service.rs | 2 +- 8 files changed, 22 insertions(+), 21 deletions(-) diff --git a/clippy.toml b/clippy.toml index d788afc84d..5f7dc66152 100644 --- a/clippy.toml +++ b/clippy.toml @@ -3,3 +3,10 @@ disallowed-methods = [ # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", ] + +disallowed-macros = [ + # use std::pin::pin + "futures::pin_mut", + # cannot disallow this, because clippy finds used from tokio macros + #"tokio::pin", +] diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 642f153f2d..7d0c07a938 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -605,7 +605,7 @@ impl PageServerNode { eprintln!("connection error: {}", e); } }); - tokio::pin!(client); + let client = std::pin::pin!(client); // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 73d25619c3..260018ad89 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -6,7 +6,6 @@ #![deny(clippy::undocumented_unsafe_blocks)] use anyhow::Context; use bytes::Bytes; -use futures::pin_mut; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; @@ -378,8 +377,7 @@ impl PostgresBackend { &mut self, cx: &mut std::task::Context<'_>, ) -> Poll> { - let flush_fut = self.flush(); - pin_mut!(flush_fut); + let flush_fut = std::pin::pin!(self.flush()); flush_fut.poll(cx) } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7f51ba82cc..74af985211 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,7 +1,6 @@ use std::sync::Arc; use anyhow::bail; -use futures::pin_mut; use futures::StreamExt; use hyper::body::HttpBody; use hyper::header; @@ -531,13 +530,12 @@ async fn query_to_json( ) -> anyhow::Result<(ReadyForQueryStatus, Value)> { info!("executing query"); let query_params = data.params; - let row_stream = client.query_raw_txt(&data.query, query_params).await?; + let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); info!("finished executing query"); // Manually drain the stream into a vector to leave 
row_stream hanging // around to get a command tag. Also check that the response is not too // big. - pin_mut!(row_stream); let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 7b9f96dce3..7c0f699958 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -11,7 +11,7 @@ use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::{pin_mut, StreamExt}; +use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::IndexPart; @@ -285,8 +285,7 @@ pub(crate) async fn list_timeline_blobs( let mut index_parts: Vec = Vec::new(); let mut initdb_archive: bool = false; - let stream = stream_listing(s3_client, &timeline_dir_target); - pin_mut!(stream); + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { let obj = obj?; let key = obj.key(); diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 93bb115883..7a08dffc66 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::{ types::{Delete, ObjectIdentifier}, Client, }; -use futures_util::{pin_mut, TryStreamExt}; +use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; @@ -199,12 +199,12 @@ async fn find_garbage_inner( } } }); - let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut tenants_checked = + std::pin::pin!(tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Process the results of Tenant checks. If a Tenant is garbage, it goes into // the `GarbageList`. Else it goes into `active_tenants` for more detailed timeline // checks if they are enabled by the `depth` parameter. - pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); let mut active_tenants: Vec = vec![]; let mut counter = 0; @@ -267,10 +267,10 @@ async fn find_garbage_inner( .map(|r| (ttid, r)) } }); - let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut timelines_checked = + std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. 
- pin_mut!(timelines_checked); while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { @@ -425,9 +425,9 @@ pub async fn purge_garbage( } } }); - let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY); + let mut get_objects_results = + std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); - pin_mut!(get_objects_results); let mut objects_to_delete = Vec::new(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 4b63bb3884..6ff9783875 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -7,7 +7,7 @@ use crate::checks::{ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; -use futures_util::{pin_mut, StreamExt, TryStreamExt}; +use futures_util::{StreamExt, TryStreamExt}; use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; @@ -226,7 +226,7 @@ pub async fn scan_metadata( Ok((ttid, data)) } let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); - let timelines = timelines.try_buffered(CONCURRENCY); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different // shards in the same tenant might refer to one anothers' keys if a shard split has happened. @@ -309,7 +309,6 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); - pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index bceaad1e16..4a97eb3993 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -68,7 +68,7 @@ async fn handle_socket( // is not Unpin, and all pgbackend/framed/tokio dependencies require stream // to be Unpin. Which is reasonable, as indeed something like TimeoutReader // shouldn't be moved. - tokio::pin!(socket); + let socket = std::pin::pin!(socket); let traffic_metrics = TrafficMetrics::new(); if let Some(current_az) = conf.availability_zone.as_deref() { From f3e4f85e65a9b6fa23a28893676d341a909bae51 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 5 Mar 2024 12:09:13 +0100 Subject: [PATCH 19/52] layer file download: final rename: fix durability (#6991) Before this PR, the layer file download code would fsync the inode after rename instead of the timeline directory. That is not in line with what a comment further up says we're doing, and it's obviously not achieving the goal of making the rename durable. 
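For reference, the rule being applied here, as a standalone sketch using plain `std::fs` (the patch itself routes the directory fsync through `VirtualFile::sync_all()`; `rename_durably` is just an illustrative name):

```
use std::fs::File;
use std::io;
use std::path::Path;

// Sketch: fsync-ing the renamed file's inode does not persist the rename;
// the directory that now holds the entry must be fsynced instead.
fn rename_durably(temp: &Path, final_path: &Path) -> io::Result<()> {
    // The file contents are assumed to have been fsynced before the rename
    // (as the download path already does for the temp file).
    std::fs::rename(temp, final_path)?;

    // Make the rename durable by syncing the parent directory, which owns
    // the directory entry the rename just rewrote.
    let parent = final_path
        .parent()
        .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "no parent directory"))?;
    File::open(parent)?.sync_all()?;
    Ok(())
}
```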
part of https://github.com/neondatabase/neon/issues/6663 --- .../tenant/remote_timeline_client/download.rs | 28 +++++++++++++------ pageserver/src/virtual_file/io_engine.rs | 26 +++++++++++++++++ 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 167e18a829..6fff6e78e2 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -14,14 +14,14 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::{backoff, crashsafe}; +use utils::backoff; use crate::config::PageServerConf; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::Generation; -use crate::virtual_file::on_fatal_io_error; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; @@ -50,9 +50,8 @@ pub async fn download_layer_file<'a>( ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let local_path = conf - .timeline_path(&tenant_shard_id, &timeline_id) - .join(layer_file_name.file_name()); + let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); + let local_path = timeline_path.join(layer_file_name.file_name()); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -149,10 +148,21 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("rename download layer file to {local_path}")) .map_err(DownloadError::Other)?; - crashsafe::fsync_async(&local_path) - .await - .with_context(|| format!("fsync layer file {local_path}")) - .map_err(DownloadError::Other)?; + // We use fatal_err() below because the after the rename above, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let work = async move { + let timeline_dir = VirtualFile::open(&timeline_path) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + }; + crate::virtual_file::io_engine::get() + .spawn_blocking_and_block_on_if_std(work) + .await; tracing::debug!("download complete: {local_path}"); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 1a8cd9f562..5fef826477 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -8,6 +8,7 @@ //! Then use [`get`] and [`super::OpenOptions`]. use tokio_epoll_uring::{IoBuf, Slice}; +use tracing::Instrument; pub(crate) use super::api::IoEngineKind; #[derive(Clone, Copy)] @@ -225,4 +226,29 @@ impl IoEngine { } } } + + /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`], + /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured + /// whereas before the switch to [`super::io_engine`], that wasn't the case. + /// This method helps avoid such a regression. + /// + /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen. 
+ pub(crate) async fn spawn_blocking_and_block_on_if_std(&self, work: Fut) -> R + where + Fut: 'static + Send + std::future::Future, + R: 'static + Send, + { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let span = tracing::info_span!("spawn_blocking_block_on_if_std"); + tokio::task::spawn_blocking({ + move || tokio::runtime::Handle::current().block_on(work.instrument(span)) + }) + .await + .expect("failed to join blocking code most likely it panicked, panicking as well") + } + IoEngine::TokioEpollUring => work.await, + } + } } From ae8468f97e4783474940a568379bbac6c70a29c9 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 5 Mar 2024 13:30:43 +0000 Subject: [PATCH 20/52] pageserver: fix AUX key vectored get validation (#7018) ## Problem The value reconstruct of AUX_FILES_KEY from records is not deterministic since it uses a hash map under the hood. This caused vectored get validation failures when enabled in staging. ## Summary of changes Deserialise AUX_FILES_KEY blobs comparing. All other keys should reconstruct deterministically, so we simply compare the blobs. --- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/timeline.rs | 41 +++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7be08f86b1..628aeb5a28 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1677,7 +1677,7 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default)] +#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] pub(crate) struct AuxFilesDirectory { pub(crate) files: HashMap, } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1f811155f6..309ec2e829 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,6 +17,7 @@ use futures::stream::StreamExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::{ + key::AUX_FILES_KEY, keyspace::KeySpaceAccum, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -891,8 +892,7 @@ impl Timeline { assert_eq!(seq_key, vec_key); match (seq_res, vec_res) { (Ok(seq_blob), Ok(vec_blob)) => { - assert_eq!(seq_blob, vec_blob, - "Image mismatch for key {seq_key} - keyspace={keyspace:?} lsn={lsn}"); + Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); }, (Err(err), Ok(_)) => { panic!( @@ -911,6 +911,43 @@ impl Timeline { } } + fn validate_key_equivalence( + key: &Key, + keyspace: &KeySpace, + lsn: Lsn, + seq: &Bytes, + vec: &Bytes, + ) { + use utils::bin_ser::BeSer; + + if *key == AUX_FILES_KEY { + // The value reconstruct of AUX_FILES_KEY from records is not deterministic + // since it uses a hash map under the hood. Hence, deserialise both results + // before comparing. + let seq_aux_dir_res = AuxFilesDirectory::des(seq); + let vec_aux_dir_res = AuxFilesDirectory::des(vec); + match (&seq_aux_dir_res, &vec_aux_dir_res) { + (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { + assert_eq!( + seq_aux_dir, vec_aux_dir, + "Mismatch for key {} - keyspace={:?} lsn={}", + key, keyspace, lsn + ); + } + (Err(_), Err(_)) => {} + _ => { + panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); + } + } + } else { + // All other keys should reconstruct deterministically, so we simply compare the blobs. 
+ assert_eq!( + seq, vec, + "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" + ); + } + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last From 9dec65b75b5262c63d89ecaaf85a2dfb4d5e84f1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 5 Mar 2024 13:35:45 +0000 Subject: [PATCH 21/52] pageserver: fix vectored read path delta layer index traversal (#7001) ## Problem Last weeks enablement of vectored get generated a number of panics. From them, I diagnosed two issues in the delta layer index traversal logic 1. The `key >= range.start && lsn >= lsn_range.start` was too aggressive. Lsns are not monotonically increasing in the delta layer index (keys are though), so we cannot assert on them. 2. Lsns greater or equal to `lsn_range.end` were not skipped. This caused the query to consider records newer than the request Lsn. ## Summary of changes * Fix the issues mentioned above inline * Refactor the layer traversal logic to make it unit testable * Add unit test which reproduces the failure modes listed above. --- pageserver/src/tenant/disk_btree.rs | 95 ++++++- .../src/tenant/storage_layer/delta_layer.rs | 257 ++++++++++++++---- .../src/tenant/storage_layer/image_layer.rs | 44 +-- pageserver/src/tenant/vectored_blob_io.rs | 12 +- 4 files changed, 322 insertions(+), 86 deletions(-) diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index ca30b0ac4f..6d85d1e60e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,10 +18,19 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! +use async_stream::try_stream; use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; -use std::{cmp::Ordering, io, result}; +use futures::Stream; +use hex; +use std::{ + cmp::Ordering, + io, + iter::Rev, + ops::{Range, RangeInclusive}, + result, +}; use thiserror::Error; use tracing::error; @@ -250,6 +259,90 @@ where Ok(result) } + /// Return a stream which yields all key, value pairs from the index + /// starting from the first key greater or equal to `start_key`. + /// + /// Note that this is a copy of [`Self::visit`]. + /// TODO: Once the sequential read path is removed this will become + /// the only index traversal method. + pub fn get_stream_from<'a>( + &'a self, + start_key: &'a [u8; L], + ctx: &'a RequestContext, + ) -> impl Stream, u64), DiskBtreeError>> + 'a { + try_stream! { + let mut stack = Vec::new(); + stack.push((self.root_blk, None)); + let block_cursor = self.reader.block_cursor(); + while let Some((node_blknum, opt_iter)) = stack.pop() { + // Locate the node. 
+ let node_buf = block_cursor + .read_blk(self.start_blk + node_blknum, ctx) + .await?; + + let node = OnDiskNode::deparse(node_buf.as_ref())?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + let mut iter: Either, Rev>> = if let Some(iter) = opt_iter { + iter + } else { + // Locate the first match + let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } + } + }; + Either::Left(idx..node.num_children.into()) + }; + + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + yield (keybuf.clone(), value.to_u64()); + } else { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); + break; + } + } + } + } + } + /// /// Scan the tree, starting from 'search_key', in the given direction. 
'visitor' /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 5eaf1cc1ce..b7132ee3bf 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -46,6 +46,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -847,10 +848,33 @@ impl DeltaLayerInner { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let reads = self - .plan_reads(keyspace, lsn_range, reconstruct_state, ctx) - .await - .map_err(GetVectoredError::Other)?; + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + + let reads = Self::plan_reads( + keyspace, + lsn_range, + data_end_offset, + index_reader, + planner, + reconstruct_state, + ctx, + ) + .await + .map_err(GetVectoredError::Other)?; self.do_reads_and_update_state(reads, reconstruct_state) .await; @@ -858,73 +882,64 @@ impl DeltaLayerInner { Ok(()) } - async fn plan_reads( - &self, + async fn plan_reads( keyspace: KeySpace, lsn_range: Range, + data_end_offset: u64, + index_reader: DiskBtreeReader, + mut planner: VectoredReadPlanner, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, - ) -> anyhow::Result> { - let mut planner = VectoredReadPlanner::new( - self.max_vectored_read_bytes - .expect("Layer is loaded with max vectored bytes config") - .0 - .into(), - ); - - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - block_reader, - ); + ) -> anyhow::Result> + where + Reader: BlockReader, + { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) + .build(); for range in keyspace.ranges.iter() { let mut range_end_handled = false; let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); - tree_reader - .visit( - &start_key.0, - VisitDirection::Forwards, - |raw_key, value| { - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - let lsn = DeltaKey::extract_lsn_from_buf(raw_key); - let blob_ref = BlobRef(value); + let index_stream = index_reader.get_stream_from(&start_key.0, &ctx); + let mut index_stream = std::pin::pin!(index_stream); - assert!(key >= range.start && lsn >= lsn_range.start); + while let Some(index_entry) = index_stream.next().await { + let (raw_key, value) = index_entry?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); - let cached_lsn = reconstruct_state.get_cached_lsn(&key); - let flag = { - if cached_lsn >= Some(lsn) { - BlobFlag::Ignore - } else if blob_ref.will_init() { - BlobFlag::Replaces - } else { - BlobFlag::None - } - }; + // Lsns are not 
monotonically increasing across keys, so we don't assert on them. + assert!(key >= range.start); - if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { - planner.handle_range_end(blob_ref.pos()); - range_end_handled = true; - false - } else { - planner.handle(key, lsn, blob_ref.pos(), flag); - true - } - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerBtreeNode) - .build(), - ) - .await - .map_err(|err| anyhow!(err))?; + let outside_lsn_range = !lsn_range.contains(&lsn); + let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); + + let flag = { + if outside_lsn_range || below_cached_lsn { + BlobFlag::Ignore + } else if blob_ref.will_init() { + BlobFlag::ReplaceAll + } else { + // Usual path: add blob to the read + BlobFlag::None + } + }; + + if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { + planner.handle_range_end(blob_ref.pos()); + range_end_handled = true; + break; + } else { + planner.handle(key, lsn, blob_ref.pos(), flag); + } + } if !range_end_handled { - let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; - tracing::info!("Handling range end fallback at {}", payload_end); - planner.handle_range_end(payload_end); + tracing::info!("Handling range end fallback at {}", data_end_offset); + planner.handle_range_end(data_end_offset); } } @@ -1190,3 +1205,131 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del self.size } } + +#[cfg(test)] +mod test { + use std::collections::BTreeMap; + + use super::*; + use crate::{ + context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk, + }; + + /// Construct an index for a fictional delta layer and and then + /// traverse in order to plan vectored reads for a query. Finally, + /// verify that the traversal fed the right index key and value + /// pairs into the planner. 
+ #[tokio::test] + async fn test_delta_layer_index_traversal() { + let base_key = Key { + field1: 0, + field2: 1663, + field3: 12972, + field4: 16396, + field5: 0, + field6: 246080, + }; + + // Populate the index with some entries + let entries: BTreeMap> = BTreeMap::from([ + (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]), + (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]), + ]); + + let mut disk = TestDisk::default(); + let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk); + + let mut disk_offset = 0; + for (key, lsns) in &entries { + for lsn in lsns { + let index_key = DeltaKey::from_key_lsn(key, *lsn); + let blob_ref = BlobRef::new(disk_offset, false); + writer + .append(&index_key.0, blob_ref.0) + .expect("In memory disk append should never fail"); + + disk_offset += 1; + } + } + + // Prepare all the arguments for the call into `plan_reads` below + let (root_offset, _writer) = writer + .finish() + .expect("In memory disk finish should never fail"); + let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); + let planner = VectoredReadPlanner::new(100); + let mut reconstruct_state = ValuesReconstructState::new(); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let keyspace = KeySpace { + ranges: vec![ + base_key..base_key.add(3), + base_key.add(3)..base_key.add(100), + ], + }; + let lsn_range = Lsn(2)..Lsn(40); + + // Plan and validate + let vectored_reads = DeltaLayerInner::plan_reads( + keyspace.clone(), + lsn_range.clone(), + disk_offset, + reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await + .expect("Read planning should not fail"); + + validate(keyspace, lsn_range, vectored_reads, entries); + } + + fn validate( + keyspace: KeySpace, + lsn_range: Range, + vectored_reads: Vec, + index_entries: BTreeMap>, + ) { + #[derive(Debug, PartialEq, Eq)] + struct BlobSpec { + key: Key, + lsn: Lsn, + at: u64, + } + + let mut planned_blobs = Vec::new(); + for read in vectored_reads { + for (at, meta) in read.blobs_at.as_slice() { + planned_blobs.push(BlobSpec { + key: meta.key, + lsn: meta.lsn, + at: *at, + }); + } + } + + let mut expected_blobs = Vec::new(); + let mut disk_offset = 0; + for (key, lsns) in index_entries { + for lsn in lsns { + let key_included = keyspace.ranges.iter().any(|range| range.contains(&key)); + let lsn_included = lsn_range.contains(&lsn); + + if key_included && lsn_included { + expected_blobs.push(BlobSpec { + key, + lsn, + at: disk_offset, + }); + } + + disk_offset += 1; + } + } + + assert_eq!(planned_blobs, expected_blobs); + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 56cfaeda15..14c79e413c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -43,6 +43,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; +use hex; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -54,6 +55,7 @@ use std::ops::Range; use std::os::unix::prelude::FileExt; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_stream::StreamExt; use tracing::*; use utils::{ @@ -488,35 +490,33 @@ impl 
ImageLayerInner { let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerBtreeNode) + .build(); + for range in keyspace.ranges.iter() { let mut range_end_handled = false; let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); - tree_reader - .visit( - &search_key, - VisitDirection::Forwards, - |raw_key, offset| { - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - assert!(key >= range.start); + let index_stream = tree_reader.get_stream_from(&search_key, &ctx); + let mut index_stream = std::pin::pin!(index_stream); - if key >= range.end { - planner.handle_range_end(offset); - range_end_handled = true; - false - } else { - planner.handle(key, self.lsn, offset, BlobFlag::None); - true - } - }, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerBtreeNode) - .build(), - ) - .await - .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; + while let Some(index_entry) = index_stream.next().await { + let (raw_key, offset) = index_entry?; + + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + assert!(key >= range.start); + + if key >= range.end { + planner.handle_range_end(offset); + range_end_handled = true; + break; + } else { + planner.handle(key, self.lsn, offset, BlobFlag::None); + } + } if !range_end_handled { let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index a8d9649d36..805f70b23b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -128,7 +128,7 @@ impl VectoredReadBuilder { pub enum BlobFlag { None, Ignore, - Replaces, + ReplaceAll, } /// Planner for vectored blob reads. @@ -170,7 +170,7 @@ impl VectoredReadPlanner { /// incorrect data to the user. /// /// The `flag` argument has two interesting values: - /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs. + /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. /// This is used for WAL records that `will_init`. /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens /// if the blob is cached. 
@@ -204,7 +204,7 @@ impl VectoredReadPlanner { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.push((lsn, start_offset, end_offset)); } - BlobFlag::Replaces => { + BlobFlag::ReplaceAll => { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.clear(); blobs_for_key.push((lsn, start_offset, end_offset)); @@ -411,10 +411,10 @@ mod tests { let blob_descriptions = vec![ (first_key, lsn, 0, BlobFlag::None), // First in read 1 (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * 1024, BlobFlag::Replaces), + (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll), (second_key, lsn, 3 * 1024, BlobFlag::None), - (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2 - (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; From 270d3be507643f068120b52838c497f6c1b45b61 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 5 Mar 2024 14:44:00 +0100 Subject: [PATCH 22/52] feat(per-tenant throttling): exclude throttled time from page_service metrics + regression test (#6953) part of https://github.com/neondatabase/neon/issues/5899 Problem ------- Before this PR, the time spent waiting on the throttle was charged towards the higher-level page_service metrics, i.e., `pageserver_smgr_query_seconds`. The metrics are the foundation of internal SLIs / SLOs. A throttled tenant would cause the SLI to degrade / SLO alerts to fire. Changes ------- - don't charge time spent in throttle towards the page_service metrics - record time spent in throttle in RequestContext and subtract it from the elapsed time - this works because the page_service path doesn't create child context, so, all the throttle time is recorded in the parent - it's quite brittle and will break if we ever decide to spawn child tasks that need child RequestContexts, which would have separate instances of the `micros_spent_throttled` counter. - however, let's punt that to a more general refactoring of RequestContext - add a test case that ensures that - throttling happens for getpage requests; this aspect of the test passed before this PR - throttling delays aren't charged towards the page_service metrics; this aspect of the test only passes with this PR - drive-by: make the throttle log message `info!`, it's an expected condition Performance ----------- I took the same measurements as in #6706 , no meaningful change in CPU overhead. Future Work ----------- This PR enables us to experiment with the throttle for select tenants without affecting the SLI metrics / triggering SLO alerts. Before declaring this feature done, we need more work to happen, specifically: - decide on whether we want to retain the flexibility of throttling any `Timeline::get` call, filtered by TaskKind - versus: separate throttles for each page_service endpoint, potentially with separate config options - the trouble here is that this decision implies changes to the TenantConfig, so, if we start using the current config style now, then decide to switch to a different config, it'll be a breaking change Nice-to-haves but probably not worth the time right now: - Equivalent tests to ensure the throttle applies to all other page_service handlers. 
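As a rough illustration of the bookkeeping described in the Changes list above, the sketch below condenses the idea into a standalone example: a sentinel-valued atomic counter that the smgr query timer opens, the throttle adds its wait time to, and the timer's drop path subtracts from the elapsed time before observing the histogram. It mirrors the pattern that the diff below implements in `context/optional_counter.rs`, but the names here (`ThrottledMicros`, `close_and_sub_from`) are illustrative only, not the actual pageserver API, and the real code additionally reports the closed/overflow error cases via rate-limited warnings instead of silently ignoring them.

```rust
use std::sync::atomic::{AtomicU32, Ordering};
use std::time::{Duration, Instant};

/// u32::MAX is the "closed" sentinel; any other value is accumulated microseconds.
struct ThrottledMicros(AtomicU32);

impl ThrottledMicros {
    fn new() -> Self {
        Self(AtomicU32::new(u32::MAX))
    }
    /// Called when the smgr query timer starts.
    fn open(&self) {
        self.0.store(0, Ordering::Relaxed);
    }
    /// Called by the throttle after it made the request wait.
    fn add(&self, wait: Duration) {
        self.0.fetch_add(wait.as_micros() as u32, Ordering::Relaxed);
    }
    /// Called when the timer is dropped: close the counter and deduct the
    /// throttled time from the observed elapsed time.
    fn close_and_sub_from(&self, elapsed: Duration) -> Duration {
        let micros = self.0.swap(u32::MAX, Ordering::Relaxed) as u64;
        elapsed.saturating_sub(Duration::from_micros(micros))
    }
}

fn main() {
    let counter = ThrottledMicros::new();

    // start_timer(): open the per-request counter and note the start time.
    counter.open();
    let start = Instant::now();

    // Inside Timeline::get the throttle delays the request and records the wait.
    let wait = Duration::from_millis(5);
    std::thread::sleep(wait);
    counter.add(wait);

    // Timer drop: only the time *not* spent throttled is observed by the
    // page_service metrics (pageserver_smgr_query_seconds).
    let ex_throttled = counter.close_and_sub_from(start.elapsed());
    println!("observed (throttle excluded): {ex_throttled:?}");
}
```

This also shows why the approach is brittle if child RequestContexts ever come into play: the counter lives on one context instance, so throttle time recorded against a child would not be visible to the timer holding the parent.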
--- pageserver/src/context.rs | 7 +- pageserver/src/context/optional_counter.rs | 101 +++++++++++++++ pageserver/src/metrics.rs | 68 +++++++++- pageserver/src/page_service.rs | 10 +- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/throttle.rs | 17 ++- .../test_pageserver_getpage_throttle.py | 118 ++++++++++++++++++ 7 files changed, 308 insertions(+), 15 deletions(-) create mode 100644 pageserver/src/context/optional_counter.rs create mode 100644 test_runner/regress/test_pageserver_getpage_throttle.py diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index ee331ea154..86d0390c30 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -88,13 +88,16 @@ use crate::task_mgr::TaskKind; +pub(crate) mod optional_counter; + // The main structure of this module, see module-level comment. -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -150,6 +153,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + micros_spent_throttled: Default::default(), }, } } @@ -163,6 +167,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs new file mode 100644 index 0000000000..100c649f18 --- /dev/null +++ b/pageserver/src/context/optional_counter.rs @@ -0,0 +1,101 @@ +use std::{ + sync::atomic::{AtomicU32, Ordering}, + time::Duration, +}; + +#[derive(Debug)] +pub struct CounterU32 { + inner: AtomicU32, +} +impl Default for CounterU32 { + fn default() -> Self { + Self { + inner: AtomicU32::new(u32::MAX), + } + } +} +impl CounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + match self + .inner + .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) + { + Ok(_) => Ok(()), + Err(_) => Err("open() called on clsoed state"), + } + } + pub fn close(&self) -> Result { + match self.inner.swap(u32::MAX, Ordering::Relaxed) { + u32::MAX => Err("close() called on closed state"), + x => Ok(x), + } + } + + pub fn add(&self, count: u32) -> Result<(), &'static str> { + if count == 0 { + return Ok(()); + } + let mut had_err = None; + self.inner + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { + u32::MAX => { + had_err = Some("add() called on closed state"); + None + } + x => { + let (new, overflowed) = x.overflowing_add(count); + if new == u32::MAX || overflowed { + had_err = Some("add() overflowed the counter"); + None + } else { + Some(new) + } + } + }) + .map_err(|_| had_err.expect("we set it whenever the function returns None")) + .map(|_| ()) + } +} + +#[derive(Default, Debug)] +pub struct MicroSecondsCounterU32 { + inner: CounterU32, +} + +impl MicroSecondsCounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + self.inner.open() + } + pub fn add(&self, duration: Duration) -> Result<(), &'static str> { + match duration.as_micros().try_into() { + Ok(x) => self.inner.add(x), + Err(_) => Err("add(): duration conversion error"), + } + } + pub fn 
close_and_checked_sub_from(&self, from: Duration) -> Result { + let val = self.inner.close()?; + let val = Duration::from_micros(val as u64); + let subbed = match from.checked_sub(val) { + Some(v) => v, + None => return Err("Duration::checked_sub"), + }; + Ok(subbed) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_basic() { + let counter = MicroSecondsCounterU32::default(); + counter.open().unwrap(); + counter.add(Duration::from_micros(23)).unwrap(); + let res = counter + .close_and_checked_sub_from(Duration::from_micros(42)) + .unwrap(); + assert_eq!(res, Duration::from_micros(42 - 23)); + } +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ce5561b431..ee62ee0367 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -11,6 +11,7 @@ use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, IntoEnumIterator, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; +use tracing::warn; use utils::id::TimelineId; /// Prometheus histogram buckets (in seconds) for operations in the critical @@ -1005,15 +1006,39 @@ impl GlobalAndPerTimelineHistogram { } } -struct GlobalAndPerTimelineHistogramTimer<'a> { +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { h: &'a GlobalAndPerTimelineHistogram, + ctx: &'c RequestContext, start: std::time::Instant, + op: SmgrQueryType, } -impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { +impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { fn drop(&mut self) { let elapsed = self.start.elapsed(); - self.h.observe(elapsed.as_secs_f64()); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(res) => res, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; + self.h.observe(ex_throttled.as_secs_f64()); } } @@ -1025,6 +1050,7 @@ impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr, + enum_map::Enum, )] #[strum(serialize_all = "snake_case")] pub enum SmgrQueryType { @@ -1130,11 +1156,35 @@ impl SmgrQueryTimePerTimeline { }); Self { metrics } } - pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ { + pub(crate) fn start_timer<'c: 'a, 'a>( + &'a self, + op: SmgrQueryType, + ctx: &'c RequestContext, + ) -> impl Drop + '_ { let metric = &self.metrics[op as usize]; + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[op]; + rate_limit.call(|| { + warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } GlobalAndPerTimelineHistogramTimer { h: metric, - start: std::time::Instant::now(), + ctx, + start, + op, } } } @@ -1145,6 +1195,11 @@ mod smgr_query_time_tests 
{ use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; + use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + }; + // Regression test, we used hard-coded string constants before using an enum. #[test] fn op_label_name() { @@ -1193,7 +1248,8 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let timer = metrics.start_timer(*op); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let timer = metrics.start_timer(*op, &ctx); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 689bc5cb3c..dacee41e6e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -910,7 +910,7 @@ impl PageServerHandler { let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists); + .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -938,7 +938,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize); + .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -966,7 +966,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize); + .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -1144,7 +1144,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetPageAtLsn); + .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = @@ -1172,7 +1172,7 @@ impl PageServerHandler { let _timer = timeline .query_metrics - .start_timer(metrics::SmgrQueryType::GetSlruSegment); + .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 45ce6c9381..57c3edcddd 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -217,7 +217,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } let allowed_rps = tenant.timeline_get_throttle.steady_rps(); let delta = now - prev; - warn!( + info!( n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), count_accounted, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 6894a88b93..280773e9c3 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -2,14 +2,14 @@ use std::{ str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, - Arc, + Arc, Mutex, }, time::{Duration, Instant}, }; use arc_swap::ArcSwap; use enumset::EnumSet; -use tracing::error; +use tracing::{error, warn}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -157,6 +157,19 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); + match ctx.micros_spent_throttled.add(wait_time) { + Ok(res) => res, + Err(error) => { + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + static 
WARN_RATE_LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut guard = WARN_RATE_LIMIT.lock().unwrap(); + guard.call(move || { + warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); + }); + } + } } } } diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py new file mode 100644 index 0000000000..42cc28efee --- /dev/null +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -0,0 +1,118 @@ +import json +import uuid + +from anyio import Path +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until + + +def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + + env.pageserver.tenant_detach(env.initial_tenant) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + rate_limit_rps = 100 + compaction_period = 5 + env.pageserver.tenant_create( + tenant_id, + conf={ + "compaction_period": f"{compaction_period}s", + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "initial": 0, + "refill_interval": "100ms", + "refill_amount": int(rate_limit_rps / 10), + "max": int(rate_limit_rps / 10), + "fair": True, + }, + }, + ) + + ps_http = env.pageserver.http_client() + + ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id) + + def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int): + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--runtime", + f"{duration_secs}s", + f"{tenant_id}/{timeline_id}", + ] + + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + return int(results["total"]["request_count"]) + + log.info("warmup / make sure metrics are present") + run_pagebench_at_max_speed_and_get_total_requests_completed(2) + metrics_query = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "smgr_query_type": "get_page_at_lsn", + } + metric_name = "pageserver_smgr_query_seconds_sum" + smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_pre is not None + + marker = uuid.uuid4().hex + ps_http.post_tracing_event("info", marker) + _, marker_offset = wait_until( + 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) + ) + + log.info("run pagebench") + duration_secs = 10 + actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) + + log.info("validate the client is capped at the configured rps limit") + expect_ncompleted = duration_secs * rate_limit_rps + delta_abs = abs(expect_ncompleted - actual_ncompleted) + threshold = 0.05 * expect_ncompleted + assert ( + threshold / rate_limit_rps < 0.1 * duration_secs + ), "test self-test: unrealistic 
expecations regarding precision in this test" + assert ( + delta_abs < 0.05 * expect_ncompleted + ), "the throttling deviates more than 5percent from the expectation" + + log.info("validate that we logged the throttling") + + wait_until( + 10, + compaction_period / 10, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*shard was throttled in the last n_seconds.*", + offset=marker_offset, + ), + ) + + log.info("validate that the metric doesn't include throttle wait time") + smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_post is not None + actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + + assert ( + duration_secs >= 10 * actual_smgr_query_seconds + ), "smgr metrics should not include throttle wait time" From bdbb2f4afc8c02620b45d52fecd71fdeb848a3c9 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:02:51 +0400 Subject: [PATCH 23/52] proxy: report redis broken message metric (#7021) ## Problem Not really a problem. Improving visibility around redis communication. ## Summary of changes Added metric on the number of broken messages. --- proxy/src/metrics.rs | 9 +++++++++ proxy/src/redis/notifications.rs | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 66031f5eb2..2464b1e611 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -303,3 +303,12 @@ pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { ) .unwrap() }); + +pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_redis_errors_total", + "Number of errors by a given classification", + &["channel"], + ) + .unwrap() +}); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index b8297a206c..6ae848c0d2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,6 +10,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, + metrics::REDIS_BROKEN_MESSAGES, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -115,6 +116,9 @@ impl< let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { + REDIS_BROKEN_MESSAGES + .with_label_values(&[msg.get_channel_name()]) + .inc(); tracing::error!("broken message: {e}"); return Ok(()); } From b036c32262871a0942211c4fba6a7099cfacacd7 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 5 Mar 2024 10:03:44 -0500 Subject: [PATCH 24/52] fix -Wmissing-prototypes for neon extension (#7010) ## Problem ref https://github.com/neondatabase/neon/issues/6188 ## Summary of changes This pull request fixes `-Wmissing-prototypes` for the neon extension. Note that (1) the gcc version in CI and macOS is different, therefore some of the warning does not get reported when developing the neon extension locally. (2) the CI env variable `COPT = -Werror` does not get passed into the docker build process, therefore warnings are not treated as errors on CI. https://github.com/neondatabase/neon/blob/e62baa97041e10ce45772b3724e24e679a650d69/.github/workflows/build_and_test.yml#L22 There will be follow-up pull requests on solving other warnings. By the way, I did not figure out the default compile parameters in the CI env, and therefore this pull request is tested by manually adding `-Wmissing-prototypes` into the `COPT`. 
Signed-off-by: Alex Chi Z --- pgxn/neon/control_plane_connector.c | 11 ++++++----- pgxn/neon/control_plane_connector.h | 2 +- pgxn/neon/extension_server.c | 1 + pgxn/neon/extension_server.h | 17 +++++++++++++++++ pgxn/neon/neon.c | 1 + pgxn/neon/neon.h | 3 +-- pgxn/neon/neon_utils.c | 3 ++- pgxn/neon/neon_utils.h | 2 +- pgxn/neon/walproposer.c | 4 ++-- pgxn/neon/walproposer_pg.c | 2 +- 10 files changed, 33 insertions(+), 13 deletions(-) create mode 100644 pgxn/neon/extension_server.h diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 00a582d718..93252e6b29 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -35,6 +35,7 @@ #include "utils/memutils.h" #include "utils/jsonb.h" +#include "control_plane_connector.h" #include "neon_utils.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; @@ -113,6 +114,8 @@ ConstructDeltaMessage() if (RootTable.db_table) { JsonbValue dbs; + HASH_SEQ_STATUS status; + DbEntry *entry; dbs.type = jbvString; dbs.val.string.val = "dbs"; @@ -120,9 +123,6 @@ ConstructDeltaMessage() pushJsonbValue(&state, WJB_KEY, &dbs); pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); - HASH_SEQ_STATUS status; - DbEntry *entry; - hash_seq_init(&status, RootTable.db_table); while ((entry = hash_seq_search(&status)) != NULL) { @@ -168,8 +168,9 @@ ConstructDeltaMessage() #else const char *logdetail; #endif + char *encrypted_password; PushKeyValue(&state, "password", (char *) entry->password); - char *encrypted_password = get_role_password(entry->name, &logdetail); + encrypted_password = get_role_password(entry->name, &logdetail); if (encrypted_password) { @@ -831,7 +832,7 @@ NeonProcessUtility( } } -extern void +void InitControlPlaneConnector() { PreviousProcessUtilityHook = ProcessUtility_hook; diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h index 12d6a97562..7eed449200 100644 --- a/pgxn/neon/control_plane_connector.h +++ b/pgxn/neon/control_plane_connector.h @@ -1,6 +1,6 @@ #ifndef CONTROL_PLANE_CONNECTOR_H #define CONTROL_PLANE_CONNECTOR_H -void InitControlPlaneConnector(); +void InitControlPlaneConnector(void); #endif diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index 039405e2cd..1329e2d17b 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,6 +14,7 @@ #include "utils/guc.h" +#include "extension_server.h" #include "neon_utils.h" static int extension_server_port = 0; diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h new file mode 100644 index 0000000000..3e67708b85 --- /dev/null +++ b/pgxn/neon/extension_server.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * extension_server.h + * Request compute_ctl to download extension files. 
+ * + * IDENTIFICATION + * contrib/neon/extension_server.h + * + *------------------------------------------------------------------------- + */ + +#ifndef EXTENSION_SERVER_H +#define EXTENSION_SERVER_H + +void pg_init_extension_server(void); + +#endif /* EXTENSION_SERVER_H */ diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index a14288b33a..1f456d9a3f 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -29,6 +29,7 @@ #include "utils/guc.h" #include "utils/wait_event.h" +#include "extension_server.h" #include "neon.h" #include "walproposer.h" #include "pagestore_client.h" diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index c3afecc679..a0f8c97497 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -25,12 +25,11 @@ extern int wal_acceptor_connection_timeout; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pg_init_extension_server(void); - extern uint64 BackpressureThrottlingTime(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); extern void PGDLLEXPORT WalProposerMain(Datum main_arg); +PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); #endif /* NEON_H */ diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index ce554c89df..1fb4ed9522 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -6,6 +6,7 @@ #include "postgres.h" +#include "neon_utils.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" @@ -14,7 +15,7 @@ * * Returns -1 if the character is not a hexadecimal digit. */ -int +static int HexDecodeChar(char c) { if (c >= '0' && c <= '9') diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 10d41db102..89683714f1 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -12,7 +12,7 @@ uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); -extern void disable_core_dump(); +void disable_core_dump(void); #ifndef WALPROPOSER_LIB diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 0d5007ef73..10487636ae 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1460,7 +1460,7 @@ RecvAppendResponses(Safekeeper *sk) } /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ -void +static void ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) { uint8 nkeys; @@ -1590,9 +1590,9 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) Safekeeper * GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { - *donor_lsn = InvalidXLogRecPtr; Safekeeper *donor = NULL; int i; + *donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 61a2a54809..7f07913fa6 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -398,7 +398,7 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -void +static void replication_feedback_set(PageserverFeedback *rf) { SpinLockAcquire(&walprop_shared->mutex); From e69a25542b4b696bcec6cd47aec62c06217a0958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 5 Mar 2024 16:26:51 +0100 Subject: [PATCH 25/52] Minor improvements to tiered compaction (#7020) Minor non-functional improvements to tiered compaction, mostly consisting of comment fixes. 
Followup of #6830, part of #6768 --- pageserver/compaction/src/compact_tiered.rs | 21 ++++--------- pageserver/compaction/src/identify_levels.rs | 19 ++++++------ pageserver/compaction/src/interface.rs | 31 ++++++++++---------- pageserver/compaction/src/simulator.rs | 1 - pageserver/src/tenant/timeline/compaction.rs | 1 - 5 files changed, 30 insertions(+), 43 deletions(-) diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 52219a014c..60fc7ac925 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -63,7 +63,7 @@ pub async fn compact_tiered( ); // Identify the range of LSNs that belong to this level. We assume that - // each file in this level span an LSN range up to 1.75x target file + // each file in this level spans an LSN range up to 1.75x target file // size. That should give us enough slop that if we created a slightly // oversized L0 layer, e.g. because flushing the in-memory layer was // delayed for some reason, we don't consider the oversized layer to @@ -248,7 +248,6 @@ enum CompactionStrategy { CreateImage, } -#[allow(dead_code)] // Todo struct CompactionJob { key_range: Range, lsn_range: Range, @@ -345,7 +344,7 @@ where /// /// TODO: Currently, this is called exactly once for the level, and we /// decide whether to create new image layers to cover the whole level, or - /// write a new set of delta. In the future, this should try to partition + /// write a new set of deltas. In the future, this should try to partition /// the key space, and make the decision separately for each partition. async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { let job = &self.jobs[job_id.0]; @@ -709,18 +708,6 @@ where } } -// Sliding window through keyspace and values -// -// This is used to decide what layer to write next, from the beginning of the window. -// -// Candidates: -// -// 1. Create an image layer, snapping to previous images -// 2. Create a delta layer, snapping to previous images -// 3. Create an image layer, snapping to -// -// - // Take previous partitioning, based on the image layers below. // // Candidate is at the front: @@ -739,6 +726,10 @@ struct WindowElement { last_key: K, // inclusive accum_size: u64, } + +// Sliding window through keyspace and values +// +// This is used to decide what layer to write next, from the beginning of the window. struct Window { elems: VecDeque>, diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index ef388fd92b..98dd46925c 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -1,5 +1,5 @@ -//! An LSM tree consists of multiple levels, each exponential larger than the -//! previous level. And each level consists of be multiple "tiers". With tiered +//! An LSM tree consists of multiple levels, each exponentially larger than the +//! previous level. And each level consists of multiple "tiers". With tiered //! compaction, a level is compacted when it has accumulated more than N tiers, //! forming one tier on the next level. //! @@ -170,13 +170,6 @@ where }) } -// helper struct used in depth() -struct Event { - key: K, - layer_idx: usize, - start: bool, -} - impl Level { /// Count the number of deltas stacked on each other. 
pub fn depth(&self) -> u64 @@ -184,6 +177,11 @@ impl Level { K: CompactionKey, L: CompactionLayer, { + struct Event { + key: K, + layer_idx: usize, + start: bool, + } let mut events: Vec> = Vec::new(); for (idx, l) in self.layers.iter().enumerate() { events.push(Event { @@ -202,7 +200,7 @@ impl Level { // Sweep the key space left to right. Stop at each distinct key, and // count the number of deltas on top of the highest image at that key. // - // This is a little enefficient, as we walk through the active_set on + // This is a little inefficient, as we walk through the active_set on // every key. We could increment/decrement a counter on each step // instead, but that'd require a bit more complex bookkeeping. let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new(); @@ -236,6 +234,7 @@ impl Level { } } } + debug_assert_eq!(active_set, BTreeSet::new()); max_depth } } diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 979ceebf0e..2bb2e749c0 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -4,12 +4,12 @@ //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. use async_trait::async_trait; +use futures::Future; use pageserver_api::{key::Key, keyspace::key_range_size}; use std::ops::Range; use utils::lsn::Lsn; /// Public interface. This is the main thing that the implementor needs to provide -#[async_trait] pub trait CompactionJobExecutor { // Type system. // @@ -17,8 +17,7 @@ pub trait CompactionJobExecutor { // compaction doesn't distinguish whether they are stored locally or // remotely. // - // The keyspace is defined by CompactionKey trait. - // + // The keyspace is defined by the CompactionKey trait. type Key: CompactionKey; type Layer: CompactionLayer + Clone; @@ -35,27 +34,27 @@ pub trait CompactionJobExecutor { // ---- /// Return all layers that overlap the given bounding box. - async fn get_layers( + fn get_layers( &mut self, key_range: &Range, lsn_range: &Range, ctx: &Self::RequestContext, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; - async fn get_keyspace( + fn get_keyspace( &mut self, key_range: &Range, lsn: Lsn, ctx: &Self::RequestContext, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; /// NB: This is a pretty expensive operation. In the real pageserver /// implementation, it downloads the layer, and keeps it resident /// until the DeltaLayer is dropped. - async fn downcast_delta_layer( + fn downcast_delta_layer( &self, layer: &Self::Layer, - ) -> anyhow::Result>; + ) -> impl Future>> + Send; // ---- // Functions to execute the plan @@ -63,33 +62,33 @@ pub trait CompactionJobExecutor { /// Create a new image layer, materializing all the values in the key range, /// at given 'lsn'. - async fn create_image( + fn create_image( &mut self, lsn: Lsn, key_range: &Range, ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; /// Create a new delta layer, containing all the values from 'input_layers' /// in the given key and LSN range. - async fn create_delta( + fn create_delta( &mut self, lsn_range: &Range, key_range: &Range, input_layers: &[Self::DeltaLayer], ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; /// Delete a layer. The compaction implementation will call this only after /// all the create_image() or create_delta() calls that deletion of this /// layer depends on have finished. 
But if the implementor has extra lazy - /// background tasks, like uploading the index json file to remote storage, + /// background tasks, like uploading the index json file to remote storage. /// it is the implementation's responsibility to track those. - async fn delete_layer( + fn delete_layer( &mut self, layer: &Self::Layer, ctx: &Self::RequestContext, - ) -> anyhow::Result<()>; + ) -> impl Future> + Send; } pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 6d07038dcd..def7983e75 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -429,7 +429,6 @@ impl From<&Arc> for MockLayer { } } -#[async_trait] impl interface::CompactionJobExecutor for MockTimeline { type Key = Key; type Layer = MockLayer; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 914e3948ef..8b544b1c3a 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -134,7 +134,6 @@ struct ResidentDeltaLayer(ResidentLayer); #[derive(Clone)] struct ResidentImageLayer(ResidentLayer); -#[async_trait] impl CompactionJobExecutor for TimelineAdaptor { type Key = crate::repository::Key; From 15b3665dc4810c4539dc3c40e94520506a56154d Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:32:58 +0400 Subject: [PATCH 26/52] proxy: fix bug with populating the data (#7023) ## Problem Branch/project and coldStart were not populated to data events. ## Summary of changes Populate it. Also added logging for the coldstart info. --- proxy/src/auth/backend/link.rs | 2 ++ proxy/src/console/messages.rs | 3 ++- proxy/src/console/provider/neon.rs | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index ec7d891247..7db76f3d9e 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -102,6 +102,8 @@ pub(super) async fn authenticate( ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); + let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); + info!(?cold_start_info, "woken up a compute node"); // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. 
Once we migrate to pg_sni_proxy diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 85adb31654..102076f2c6 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -101,9 +101,10 @@ pub struct MetricsAuxInfo { pub cold_start_info: Option, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Default, Serialize, Deserialize, Clone)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { + #[default] Unknown = 0, Warm = 1, PoolHit = 2, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 71b34cb676..f3befa33e0 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -259,6 +259,9 @@ impl super::Api for Api { } let node = self.do_wake_compute(ctx, user_info).await?; + ctx.set_project(node.aux.clone()); + let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); + info!(?cold_start_info, "woken up a compute node"); let (_, cached) = self.caches.node_info.insert(key.clone(), node); info!(key = &*key, "created a cache entry for compute node info"); From 2daa2f1d1059c033ac25718c6e67d7b3953c20a6 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 5 Mar 2024 15:41:05 +0000 Subject: [PATCH 27/52] test: disable large slru basebackup bench in ci (#7025) The test is flaky due to https://github.com/neondatabase/neon/issues/7006. --- .../pageserver/pagebench/test_large_slru_basebackup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index e2e7fffdbe..921b7c5b76 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -1,5 +1,6 @@ import asyncio import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -19,6 +20,10 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("n_tenants", [10]) @pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006", +) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, From eacdc179dc0e396ef12a098478cb807be4f847cf Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 5 Mar 2024 18:03:51 +0100 Subject: [PATCH 28/52] fixup(#6991): it broke the macOS build (#7024) --- pageserver/src/virtual_file/io_engine.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 5fef826477..e369d28711 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -248,6 +248,7 @@ impl IoEngine { .await .expect("failed to join blocking code most likely it panicked, panicking as well") } + #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => work.await, } } From 2f88e7a921b4b37f3aa992bc1b419d24b24b965b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 6 Mar 2024 02:40:23 +0100 Subject: [PATCH 29/52] Move compaction code to compaction.rs (#7026) Moves some of the (legacy) compaction code to compaction.rs. No functional changes, just moves of code. 
Before, compaction.rs was only for the new tiered compaction mechanism, now it's for both the old and new mechanisms. Part of #6768 --- pageserver/src/tenant/timeline.rs | 693 +----------------- pageserver/src/tenant/timeline/compaction.rs | 706 ++++++++++++++++++- 2 files changed, 703 insertions(+), 696 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 309ec2e829..37acebb10a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,7 +14,6 @@ use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; use futures::stream::StreamExt; -use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::{ key::AUX_FILES_KEY, @@ -35,7 +34,7 @@ use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use std::{ array, - collections::{BTreeMap, BinaryHeap, HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, sync::atomic::AtomicU64, }; use std::{ @@ -57,7 +56,7 @@ use crate::tenant::{ metadata::TimelineMetadata, }; use crate::{ - context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder}, + context::{DownloadBehavior, RequestContext}, disk_usage_eviction_task::DiskUsageEvictionInfo, pgdatadir_mapping::CollectKeySpaceError, }; @@ -1146,118 +1145,6 @@ impl Timeline { } } - /// TODO: cancellation - async fn compact_legacy( - self: &Arc, - _cancel: &CancellationToken, - flags: EnumSet, - ctx: &RequestContext, - ) -> Result<(), CompactionError> { - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - - // Is the timeline being deleted? 
- if self.is_stopping() { - trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); - } - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - - // FIXME: the match should only cover repartitioning, not the next steps - match self - .repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - flags, - ctx, - ) - .await - { - Ok((partitioning, lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); - - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; - timer.stop_and_record(); - - // 3. Create new image layers for partitions that have been modified - // "enough". - let layers = self - .create_image_layers( - &partitioning, - lsn, - flags.contains(CompactFlags::ForceImageLayerCreation), - &image_ctx, - ) - .await - .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() { - error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - } - }; - - Ok(()) - } - /// Mutate the timeline with a [`TimelineWriter`]. pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { @@ -3766,12 +3653,6 @@ impl Timeline { } } -#[derive(Default)] -struct CompactLevel0Phase1Result { - new_layers: Vec, - deltas_to_compact: Vec, -} - /// Top-level failure to compact. 
#[derive(Debug, thiserror::Error)] pub(crate) enum CompactionError { @@ -3825,577 +3706,7 @@ impl DurationRecorder { } } -#[derive(Default)] -struct CompactLevel0Phase1StatsBuilder { - version: Option, - tenant_id: Option, - timeline_id: Option, - read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, - read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, - read_lock_held_compute_holes_micros: DurationRecorder, - read_lock_drop_micros: DurationRecorder, - write_layer_files_micros: DurationRecorder, - level0_deltas_count: Option, - new_deltas_count: Option, - new_deltas_size: Option, -} - -#[derive(serde::Serialize)] -struct CompactLevel0Phase1Stats { - version: u64, - tenant_id: TenantShardId, - timeline_id: TimelineId, - read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, - read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: RecordedDuration, - read_lock_held_compute_holes_micros: RecordedDuration, - read_lock_drop_micros: RecordedDuration, - write_layer_files_micros: RecordedDuration, - level0_deltas_count: usize, - new_deltas_count: usize, - new_deltas_size: u64, -} - -impl TryFrom for CompactLevel0Phase1Stats { - type Error = anyhow::Error; - - fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { - Ok(Self { - version: value.version.ok_or_else(|| anyhow!("version not set"))?, - tenant_id: value - .tenant_id - .ok_or_else(|| anyhow!("tenant_id not set"))?, - timeline_id: value - .timeline_id - .ok_or_else(|| anyhow!("timeline_id not set"))?, - read_lock_acquisition_micros: value - .read_lock_acquisition_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, - read_lock_held_key_sort_micros: value - .read_lock_held_key_sort_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, - read_lock_held_compute_holes_micros: value - .read_lock_held_compute_holes_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, - read_lock_drop_micros: value - .read_lock_drop_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, - write_layer_files_micros: value - .write_layer_files_micros - .into_recorded() - .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, - level0_deltas_count: value - .level0_deltas_count - .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, - new_deltas_count: value - .new_deltas_count - .ok_or_else(|| anyhow!("new_deltas_count not set"))?, - new_deltas_size: value - .new_deltas_size - .ok_or_else(|| anyhow!("new_deltas_size not set"))?, - }) - } -} - impl Timeline { - /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
- async fn compact_level0_phase1( - self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, - mut stats: CompactLevel0Phase1StatsBuilder, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); - stats.level0_deltas_count = Some(level0_deltas.len()); - // Only compact if enough layers have accumulated. - let threshold = self.get_compaction_threshold(); - if level0_deltas.is_empty() || level0_deltas.len() < threshold { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); - } - - // This failpoint is used together with `test_duplicate_layers` integration test. - // It returns the compaction result exactly the same layers as input to compaction. - // We want to ensure that this will not cause any problem when updating the layer map - // after the compaction is finished. - // - // Currently, there are two rare edge cases that will cause duplicated layers being - // inserted. - // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which - // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer - // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this - // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, - // and this causes an overwrite. This is acceptable because the content is the same, and we should do a - // layer replace instead of the normal remove / upload process. - // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file - // size length. Compaction will likely create the same set of n files afterwards. - // - // This failpoint is a superset of both of the cases. - if cfg!(feature = "testing") { - let active = (|| { - ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); - false - })(); - - if active { - let mut new_layers = Vec::with_capacity(level0_deltas.len()); - for delta in &level0_deltas { - // we are just faking these layers as being produced again for this failpoint - new_layers.push( - delta - .download_and_keep_resident() - .await - .context("download layer for failpoint")?, - ); - } - tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint - return Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: level0_deltas, - }); - } - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. 
This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. - level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; - let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); - - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); - for l in level0_deltas_iter { - let lsn_range = &l.layer_desc().lsn_range; - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(l.download_and_keep_resident().await?); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact - .first() - .unwrap() - .layer_desc() - .lsn_range - .start, - end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - - for l in deltas_to_compact.iter() { - info!("compact includes {l}"); - } - - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); - - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? - - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); - - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. 
- let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole - } - } - } - } - prev = Some(next_key.next()); - } - stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_rlock(guard); - stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); - - // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = all_keys - .iter() - .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) - .coalesce(|mut prev, cur| { - // Coalesce keys that belong to the same key pair. - // This ensures that compaction doesn't put them - // into different layer files. - // Still limit this by the target file size, - // so that we keep the size of the files in - // check. - if prev.0 == cur.0 && prev.2 < target_file_size { - prev.2 += cur.2; - Ok(prev) - } else { - Err((prev, cur)) - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. - // It's possible that there is a single key with so many page versions that storing all of them in a single layer file - // would be too large. In that case, we also split on the LSN dimension. - // - // LSN - // ^ - // | - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // - // - // If one key (X) has a lot of page versions: - // - // LSN - // ^ - // | (X) - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | +--+ | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | +--+ | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. - let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - let mut key_values_total_size = 0u64; - let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key - let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - - for &DeltaEntry { - key, lsn, ref val, .. 
- } in all_values_iter - { - let value = val.load(ctx).await?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - // We are writting segment with duplicates: - // place all remaining values of this key in separate segment - dup_start_lsn = dup_end_lsn; // new segments starts where old stops - dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range - } - break; - } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; - } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... 
if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self) - .await?, - ); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; - } - } - } - // Remember size of key value because at next iteration we will access next item - key_values_total_size = next_key_size; - } - fail_point!("delta-layer-writer-fail-before-finish", |_| { - Err(CompactionError::Other(anyhow::anyhow!( - "failpoint delta-layer-writer-fail-before-finish" - ))) - }); - - if !self.shard_identity.is_key_disposable(&key) { - if writer.is_none() { - // Create writer if not initiaized yet - writer = Some( - DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - ) - .await?, - ); - } - - writer.as_mut().unwrap().put_value(key, lsn, value).await?; - } else { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } - - if !new_layers.is_empty() { - fail_point!("after-timeline-compacted-first-L1"); - } - - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); - } - - // Sync layers - if !new_layers.is_empty() { - // Print a warning if the created layer is larger than double the target size - // Add two pages for potential overhead. This should in theory be already - // accounted for in the target calculation, but for very small targets, - // we still might easily hit the limit otherwise. - let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; - for layer in new_layers.iter() { - if layer.layer_desc().file_size > warn_limit { - warn!( - %layer, - "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size - ); - } - } - - // The writer.finish() above already did the fsync of the inodes. - // We just need to fsync the directory in which these inodes are linked, - // which we know to be the timeline directory. - // - // We use fatal_err() below because the after writer.finish() returns with success, - // the in-memory state of the filesystem already has the layer file in its final place, - // and subsequent pageserver code could think it's durable while it really isn't. 
- let timeline_dir = VirtualFile::open( - &self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id), - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); - } - - stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); - stats.new_deltas_count = Some(new_layers.len()); - stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); - - match TryInto::::try_into(stats) - .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) - { - Ok(stats_json) => { - info!( - stats_json = stats_json.as_str(), - "compact_level0_phase1 stats available" - ) - } - Err(e) => { - warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); - } - } - - Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: deltas_to_compact - .into_iter() - .map(|x| x.drop_eviction_guard()) - .collect::>(), - }) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. - /// - async fn compact_level0( - self: &Arc, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result<(), CompactionError> { - let CompactLevel0Phase1Result { - new_layers, - deltas_to_compact, - } = { - let phase1_span = info_span!("compact_level0_phase1"); - let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { - version: Some(2), - tenant_id: Some(self.tenant_shard_id), - timeline_id: Some(self.timeline_id), - ..Default::default() - }; - - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); - self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) - .instrument(phase1_span) - .await? - }; - - if new_layers.is_empty() && deltas_to_compact.is_empty() { - // nothing to do - return Ok(()); - } - - self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) - .await?; - Ok(()) - } - async fn finish_compact_batch( self: &Arc, new_deltas: &[ResidentLayer], diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b544b1c3a..74b75dabf0 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,24 +4,32 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. 
+use std::collections::BinaryHeap; use std::ops::{Deref, Range}; use std::sync::Arc; -use super::Timeline; +use super::layer_manager::LayerManager; +use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; +use anyhow::{anyhow, Context}; use async_trait::async_trait; +use enumset::EnumSet; use fail::fail_point; +use itertools::Itertools; +use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::id::TimelineId; -use crate::context::RequestContext; +use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; -use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole}; use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::tenant::PageReconstructError; -use crate::ZERO_PAGE; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::{page_cache, ZERO_PAGE}; use crate::keyspace::KeySpace; use crate::repository::Key; @@ -33,6 +41,694 @@ use pageserver_compaction::interface::*; use super::CompactionError; +impl Timeline { + /// TODO: cancellation + pub(crate) async fn compact_legacy( + self: &Arc, + _cancel: &CancellationToken, + flags: EnumSet, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + + // Is the timeline being deleted? 
+ if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + // FIXME: the match should only cover repartitioning, not the next steps + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + flags, + ctx, + ) + .await + { + Ok((partitioning, lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + + // 2. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + self.compact_level0(target_file_size, ctx).await?; + timer.stop_and_record(); + + // 3. Create new image layers for partitions that have been modified + // "enough". + let layers = self + .create_image_layers( + &partitioning, + lsn, + flags.contains(CompactFlags::ForceImageLayerCreation), + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + if let Some(remote_client) = &self.remote_client { + for layer in layers { + remote_client.schedule_layer_file_upload(layer)?; + } + } + + if let Some(remote_client) = &self.remote_client { + // should any new image layer been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + remote_client.schedule_index_upload_for_file_changes()?; + } + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() { + tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + } + }; + + Ok(()) + } + + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + async fn compact_level0( + self: &Arc, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + let CompactLevel0Phase1Result { + new_layers, + deltas_to_compact, + } = { + let phase1_span = info_span!("compact_level0_phase1"); + let ctx = ctx.attached_child(); + let mut stats = CompactLevel0Phase1StatsBuilder { + version: Some(2), + tenant_id: Some(self.tenant_shard_id), + timeline_id: Some(self.timeline_id), + ..Default::default() + }; + + let begin = tokio::time::Instant::now(); + let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) + .instrument(phase1_span) + .await? + }; + + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + + self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) + .await?; + Ok(()) + } + + /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
+ async fn compact_level0_phase1( + self: &Arc, + guard: tokio::sync::OwnedRwLockReadGuard, + mut stats: CompactLevel0Phase1StatsBuilder, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result { + stats.read_lock_held_spawn_blocking_startup_micros = + stats.read_lock_acquisition_micros.till_now(); // set by caller + let layers = guard.layer_map(); + let level0_deltas = layers.get_level0_deltas()?; + let mut level0_deltas = level0_deltas + .into_iter() + .map(|x| guard.get_from_desc(&x)) + .collect_vec(); + stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } + + // This failpoint is used together with `test_duplicate_layers` integration test. + // It returns the compaction result exactly the same layers as input to compaction. + // We want to ensure that this will not cause any problem when updating the layer map + // after the compaction is finished. + // + // Currently, there are two rare edge cases that will cause duplicated layers being + // inserted. + // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which + // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer + // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this + // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, + // and this causes an overwrite. This is acceptable because the content is the same, and we should do a + // layer replace instead of the normal remove / upload process. + // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file + // size length. Compaction will likely create the same set of n files afterwards. + // + // This failpoint is a superset of both of the cases. + if cfg!(feature = "testing") { + let active = (|| { + ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); + false + })(); + + if active { + let mut new_layers = Vec::with_capacity(level0_deltas.len()); + for delta in &level0_deltas { + // we are just faking these layers as being produced again for this failpoint + new_layers.push( + delta + .download_and_keep_resident() + .await + .context("download layer for failpoint")?, + ); + } + tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint + return Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: level0_deltas, + }); + } + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. 
This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. + level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; + let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + for l in level0_deltas_iter { + let lsn_range = &l.layer_desc().lsn_range; + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(l.download_and_keep_resident().await?); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact + .first() + .unwrap() + .layer_desc() + .lsn_range + .start, + end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + + for l in deltas_to_compact.iter() { + info!("compact includes {l}"); + } + + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + stats.read_lock_held_prerequisites_micros = stats + .read_lock_held_spawn_blocking_startup_micros + .till_now(); + + // Determine N largest holes where N is number of compacted layers. + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? + + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + let mut all_keys = Vec::new(); + + for l in deltas_to_compact.iter() { + all_keys.extend(l.load_keys(ctx).await?); + } + + // FIXME: should spawn_blocking the rest of this function + + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + + stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. 
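// For illustration only (not part of this patch): a minimal standalone sketch of
// the "keep just the N largest holes" technique used right below. std's
// BinaryHeap is a max-heap, so wrapping entries in `Reverse` yields a min-heap
// whose smallest element can be evicted whenever the heap grows past the cap;
// in the real code the same effect presumably comes from `Hole`'s own ordering.
// The plain `usize` items here are simplified stand-ins for `Hole`.
fn largest_n(sizes: Vec<usize>, n: usize) -> Vec<usize> {
    use std::cmp::Reverse;
    use std::collections::BinaryHeap;
    // reserve one extra slot for the element pushed just before eviction,
    // mirroring `BinaryHeap::with_capacity(max_holes + 1)` below
    let mut heap: BinaryHeap<Reverse<usize>> = BinaryHeap::with_capacity(n + 1);
    for s in sizes {
        heap.push(Reverse(s));
        if heap.len() > n {
            heap.pop(); // drop the current smallest, keeping the n largest seen so far
        }
    }
    heap.into_iter().map(|Reverse(s)| s).collect()
}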
+ let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } + } + } + } + prev = Some(next_key.next()); + } + stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); + drop_rlock(guard); + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + let mut next_hole = 0; // index of next hole in holes vector + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = all_keys.iter(); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = all_keys + .iter() + .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) + .coalesce(|mut prev, cur| { + // Coalesce keys that belong to the same key pair. + // This ensures that compaction doesn't put them + // into different layer files. + // Still limit this by the target file size, + // so that we keep the size of the files in + // check. + if prev.0 == cur.0 && prev.2 < target_file_size { + prev.2 += cur.2; + Ok(prev) + } else { + Err((prev, cur)) + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + + for &DeltaEntry { + key, lsn, ref val, .. 
+ } in all_values_iter + { + let value = val.load(ctx).await?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + // We are writting segment with duplicates: + // place all remaining values of this key in separate segment + dup_start_lsn = dup_end_lsn; // new segments starts where old stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range + } + break; + } + key_values_total_size += next_size; + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. + if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; + } + } + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + || contains_hole + { + // ... 
if so, flush previous layer and prepare to write new one + new_layers.push( + writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), self) + .await?, + ); + writer = None; + + if contains_hole { + // skip hole + next_hole += 1; + } + } + } + // Remember size of key value because at next iteration we will access next item + key_values_total_size = next_key_size; + } + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(CompactionError::Other(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + ))) + }); + + if !self.shard_identity.is_key_disposable(&key) { + if writer.is_none() { + // Create writer if not initiaized yet + writer = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + ) + .await?, + ); + } + + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } + + if !new_layers.is_empty() { + fail_point!("after-timeline-compacted-first-L1"); + } + + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + } + + // Sync layers + if !new_layers.is_empty() { + // Print a warning if the created layer is larger than double the target size + // Add two pages for potential overhead. This should in theory be already + // accounted for in the target calculation, but for very small targets, + // we still might easily hit the limit otherwise. + let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; + for layer in new_layers.iter() { + if layer.layer_desc().file_size > warn_limit { + warn!( + %layer, + "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size + ); + } + } + + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. 
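// For illustration only (not part of this patch): the durability rule the
// comment above describes, as a minimal standalone sketch. After a new file's
// contents are fsynced, the directory that links it must be fsynced as well,
// or a crash may lose the directory entry even though the data reached disk.
// Opening a directory read-only and fsyncing it works on Unix targets (which
// is what the pageserver runs on); it is not portable to Windows.
fn fsync_dir(dir: &std::path::Path) -> std::io::Result<()> {
    // std::fs::File::open on a directory gives a handle whose sync_all()
    // flushes the directory entries, analogous to the VirtualFile::open +
    // sync_all() calls on the timeline directory just below.
    std::fs::File::open(dir)?.sync_all()?;
    Ok(())
}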
+ let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + + stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); + stats.new_deltas_count = Some(new_layers.len()); + stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); + + match TryInto::::try_into(stats) + .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) + { + Ok(stats_json) => { + info!( + stats_json = stats_json.as_str(), + "compact_level0_phase1 stats available" + ) + } + Err(e) => { + warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); + } + } + + Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: deltas_to_compact + .into_iter() + .map(|x| x.drop_eviction_guard()) + .collect::>(), + }) + } +} + +#[derive(Default)] +struct CompactLevel0Phase1Result { + new_layers: Vec, + deltas_to_compact: Vec, +} + +#[derive(Default)] +struct CompactLevel0Phase1StatsBuilder { + version: Option, + tenant_id: Option, + timeline_id: Option, + read_lock_acquisition_micros: DurationRecorder, + read_lock_held_spawn_blocking_startup_micros: DurationRecorder, + read_lock_held_key_sort_micros: DurationRecorder, + read_lock_held_prerequisites_micros: DurationRecorder, + read_lock_held_compute_holes_micros: DurationRecorder, + read_lock_drop_micros: DurationRecorder, + write_layer_files_micros: DurationRecorder, + level0_deltas_count: Option, + new_deltas_count: Option, + new_deltas_size: Option, +} + +#[derive(serde::Serialize)] +struct CompactLevel0Phase1Stats { + version: u64, + tenant_id: TenantShardId, + timeline_id: TimelineId, + read_lock_acquisition_micros: RecordedDuration, + read_lock_held_spawn_blocking_startup_micros: RecordedDuration, + read_lock_held_key_sort_micros: RecordedDuration, + read_lock_held_prerequisites_micros: RecordedDuration, + read_lock_held_compute_holes_micros: RecordedDuration, + read_lock_drop_micros: RecordedDuration, + write_layer_files_micros: RecordedDuration, + level0_deltas_count: usize, + new_deltas_count: usize, + new_deltas_size: u64, +} + +impl TryFrom for CompactLevel0Phase1Stats { + type Error = anyhow::Error; + + fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { + Ok(Self { + version: value.version.ok_or_else(|| anyhow!("version not set"))?, + tenant_id: value + .tenant_id + .ok_or_else(|| anyhow!("tenant_id not set"))?, + timeline_id: value + .timeline_id + .ok_or_else(|| anyhow!("timeline_id not set"))?, + read_lock_acquisition_micros: value + .read_lock_acquisition_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, + read_lock_held_spawn_blocking_startup_micros: value + .read_lock_held_spawn_blocking_startup_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, + read_lock_held_key_sort_micros: value + .read_lock_held_key_sort_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, + read_lock_held_prerequisites_micros: value + .read_lock_held_prerequisites_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, + read_lock_held_compute_holes_micros: value + .read_lock_held_compute_holes_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not 
set"))?, + read_lock_drop_micros: value + .read_lock_drop_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, + write_layer_files_micros: value + .write_layer_files_micros + .into_recorded() + .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, + level0_deltas_count: value + .level0_deltas_count + .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, + new_deltas_count: value + .new_deltas_count + .ok_or_else(|| anyhow!("new_deltas_count not set"))?, + new_deltas_size: value + .new_deltas_size + .ok_or_else(|| anyhow!("new_deltas_size not set"))?, + }) + } +} + impl Timeline { /// Entry point for new tiered compaction algorithm. /// From a3ef50c9b60b2652eb6cc863acf0f4c92ed157a0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Mar 2024 11:26:29 +0000 Subject: [PATCH 30/52] storage controller: use 'lazy' mode for location_config (#6987) ## Problem If large numbers of shards are attached to a pageserver concurrently, for example after another node fails, it can cause excessive I/O queue depths due to all the newly attached shards trying to calculate logical sizes concurrently. #6907 added the `lazy` flag to handle this. ## Summary of changes - Use `lazy=true` from all /location_config calls in the storage controller Reconciler. --- .../attachment_service/src/reconciler.rs | 26 +++++++++++++------ .../attachment_service/src/service.rs | 1 + control_plane/src/pageserver.rs | 3 ++- pageserver/client/src/mgmt_api.rs | 25 ++++++++++++------ 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index b633b217c7..d4f940373f 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -104,6 +104,7 @@ impl Reconciler { node_id: NodeId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> anyhow::Result<()> { let node = self .pageservers @@ -118,7 +119,7 @@ impl Reconciler { let client = mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); client - .location_config(self.tenant_shard_id, config.clone(), flush_ms) + .location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy) .await?; tracing::info!("location_config({}) complete: {:?}", node_id, config); @@ -315,8 +316,13 @@ impl Reconciler { self.generation, None, ); - self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10))) - .await?; + self.location_config( + origin_ps_id, + stale_conf, + Some(Duration::from_secs(10)), + false, + ) + .await?; let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?); @@ -350,7 +356,8 @@ impl Reconciler { ); tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id); - self.location_config(dest_ps_id, dest_conf, None).await?; + self.location_config(dest_ps_id, dest_conf, None, false) + .await?; if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); @@ -382,7 +389,7 @@ impl Reconciler { None, Some(LocationConfigSecondary { warm: true }), ); - self.location_config(origin_ps_id, origin_secondary_conf.clone(), None) + self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false) .await?; // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail // partway through. 
In fact, all location conf API calls should be in a wrapper that sets @@ -405,7 +412,7 @@ impl Reconciler { self.generation, None, ); - self.location_config(dest_ps_id, dest_final_conf.clone(), None) + self.location_config(dest_ps_id, dest_final_conf.clone(), None, false) .await?; self.observed.locations.insert( dest_ps_id, @@ -491,7 +498,10 @@ impl Reconciler { wanted_conf.generation = generation.into(); } tracing::info!(%node_id, "Observed configuration requires update."); - self.location_config(node_id, wanted_conf, None).await?; + // Use lazy=true, because we may run many of Self concurrently, and do not want to + // overload the pageserver with logical size calculations. + self.location_config(node_id, wanted_conf, None, true) + .await?; self.compute_notify().await?; } } @@ -543,7 +553,7 @@ impl Reconciler { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(node_id, conf, None).await?; + self.location_config(node_id, conf, None, false).await?; } Ok(()) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 4209b62db3..bc34c9dcf6 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -468,6 +468,7 @@ impl Service { tenant_conf: models::TenantConfig::default(), }, None, + false, ) .await { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 7d0c07a938..b2904c1191 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -537,10 +537,11 @@ impl PageServerNode { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> anyhow::Result<()> { Ok(self .http_client - .location_config(tenant_shard_id, config, flush_ms) + .location_config(tenant_shard_id, config, flush_ms, lazy) .await?) } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 969d0d99c0..4dde7bdf0b 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -251,21 +251,30 @@ impl Client { tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { tenant_id: tenant_shard_id, config, }; - let path = format!( + + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", self.mgmt_api_endpoint, tenant_shard_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - self.request(Method::PUT, &path, &req_body).await?; + )) + // Should always work: mgmt_api_endpoint is configuration, not user input. + .expect("Cannot build URL"); + + if lazy { + path.query_pairs_mut().append_pair("lazy", "true"); + } + + if let Some(flush_ms) = flush_ms { + path.query_pairs_mut() + .append_pair("flush_ms", &format!("{}", flush_ms.as_millis())); + } + + self.request(Method::PUT, path, &req_body).await?; Ok(()) } From 4a31e18c81edbfdf78fddcc8cba6391d64dc169c Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Mar 2024 13:56:30 +0000 Subject: [PATCH 31/52] storage controller: include stripe size in compute notifications (#6974) ## Problem - The storage controller is the source of truth for a tenant's stripe size, but doesn't currently have a way to propagate that to compute: we're just using the default stripe size everywhere. 
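As a rough illustration of the payload this enables (a hedged sketch, not code from this patch): judging by the `ComputeHookNotifyRequest` and `ComputeHookNotifyRequestShard` structs in the diff below, and assuming the hook body is JSON as the Serialize/Deserialize derives suggest, a consumer could model the notification along these lines, with simplified stand-in types for the ID and stripe-size newtypes.

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct NotifyShard {
    shard_number: u8, // stand-in for ShardNumber
    node_id: u64,     // stand-in for NodeId
}

#[derive(Deserialize, Debug)]
struct NotifyRequest {
    tenant_id: String,        // stand-in for TenantId
    stripe_size: Option<u32>, // advertised only for multi-sharded tenants
    shards: Vec<NotifyShard>,
}

fn parse_notification(body: &str) -> serde_json::Result<NotifyRequest> {
    serde_json::from_str(body)
}

In the neon_local flavour of the hook shown in the diff, each `node_id` is resolved to a pageserver endpoint and passed, together with the optional `stripe_size`, to `Endpoint::reconfigure`.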
Closes: https://github.com/neondatabase/neon/issues/6903 ## Summary of changes - Include stripe size in `ComputeHookNotifyRequest` - Include stripe size in `LocationConfigResponse` The stripe size is optional: it will only be advertised for multi-sharded tenants. This enables the controller to defer the choice of stripe size until we split a tenant for the first time. --- .../attachment_service/src/compute_hook.rs | 258 ++++++++++++++---- .../attachment_service/src/reconciler.rs | 7 +- .../attachment_service/src/service.rs | 34 ++- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/endpoint.rs | 10 +- libs/pageserver_api/src/models.rs | 2 + pageserver/src/http/openapi_spec.yml | 4 + pageserver/src/http/routes.rs | 19 +- pageserver/src/tenant.rs | 5 + test_runner/regress/test_sharding_service.py | 26 +- 10 files changed, 291 insertions(+), 76 deletions(-) diff --git a/control_plane/attachment_service/src/compute_hook.rs b/control_plane/attachment_service/src/compute_hook.rs index b5e90491c6..bebc62ac2f 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/control_plane/attachment_service/src/compute_hook.rs @@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use hyper::{Method, StatusCode}; -use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId}; +use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -19,8 +19,66 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); pub(crate) const API_CONCURRENCY: usize = 32; -pub(super) struct ComputeHookTenant { - shards: Vec<(ShardIndex, NodeId)>, +struct ShardedComputeHookTenant { + stripe_size: ShardStripeSize, + shard_count: ShardCount, + shards: Vec<(ShardNumber, NodeId)>, +} + +enum ComputeHookTenant { + Unsharded(NodeId), + Sharded(ShardedComputeHookTenant), +} + +impl ComputeHookTenant { + /// Construct with at least one shard's information + fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + if tenant_shard_id.shard_count.count() > 1 { + Self::Sharded(ShardedComputeHookTenant { + shards: vec![(tenant_shard_id.shard_number, node_id)], + stripe_size, + shard_count: tenant_shard_id.shard_count, + }) + } else { + Self::Unsharded(node_id) + } + } + + /// Set one shard's location. If stripe size or shard count have changed, Self is reset + /// and drops existing content. + fn update( + &mut self, + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + node_id: NodeId, + ) { + match self { + Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => { + *existing_node_id = node_id + } + Self::Sharded(sharded_tenant) + if sharded_tenant.stripe_size == stripe_size + && sharded_tenant.shard_count == tenant_shard_id.shard_count => + { + if let Some(existing) = sharded_tenant + .shards + .iter() + .position(|s| s.0 == tenant_shard_id.shard_number) + { + sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id; + } else { + sharded_tenant + .shards + .push((tenant_shard_id.shard_number, node_id)); + sharded_tenant.shards.sort_by_key(|s| s.0) + } + } + _ => { + // Shard count changed: reset struct. 
+ *self = Self::new(tenant_shard_id, stripe_size, node_id); + } + } + } } #[derive(Serialize, Deserialize, Debug)] @@ -33,6 +91,7 @@ struct ComputeHookNotifyRequestShard { #[derive(Serialize, Deserialize, Debug)] struct ComputeHookNotifyRequest { tenant_id: TenantId, + stripe_size: Option, shards: Vec, } @@ -63,42 +122,43 @@ pub(crate) enum NotifyError { } impl ComputeHookTenant { - async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option { - // Find the highest shard count and drop any shards that aren't - // for that shard count. - let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max(); - let Some(shard_count) = shard_count else { - // No shards, nothing to do. - tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards"); - return None; - }; - - self.shards.retain(|(k, _v)| k.shard_count == shard_count); - self.shards - .sort_by_key(|(shard, _node_id)| shard.shard_number); - - if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() { - // We have pageservers for all the shards: emit a configuration update - return Some(ComputeHookNotifyRequest { + fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option { + match self { + Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest { tenant_id, - shards: self - .shards - .iter() - .map(|(shard, node_id)| ComputeHookNotifyRequestShard { - shard_number: shard.shard_number, - node_id: *node_id, - }) - .collect(), - }); - } else { - tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", - self.shards.len(), - shard_count.count() - ); - } + shards: vec![ComputeHookNotifyRequestShard { + shard_number: ShardNumber(0), + node_id: *node_id, + }], + stripe_size: None, + }), + Self::Sharded(sharded_tenant) + if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => + { + Some(ComputeHookNotifyRequest { + tenant_id, + shards: sharded_tenant + .shards + .iter() + .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + shard_number: *shard_number, + node_id: *node_id, + }) + .collect(), + stripe_size: Some(sharded_tenant.stripe_size), + }) + } + Self::Sharded(sharded_tenant) => { + // Sharded tenant doesn't yet have information for all its shards - None + tracing::info!( + "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", + sharded_tenant.shards.len(), + sharded_tenant.shard_count.count() + ); + None + } + } } } @@ -139,7 +199,11 @@ impl ComputeHook { }; let cplane = ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); - let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request; + let ComputeHookNotifyRequest { + tenant_id, + shards, + stripe_size, + } = reconfigure_request; let compute_pageservers = shards .into_iter() @@ -156,7 +220,9 @@ impl ComputeHook { for (endpoint_name, endpoint) in &cplane.endpoints { if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); - endpoint.reconfigure(compute_pageservers.clone()).await?; + endpoint + .reconfigure(compute_pageservers.clone(), stripe_size) + .await?; } } @@ -271,30 +337,26 @@ impl ComputeHook { &self, tenant_shard_id: TenantShardId, node_id: NodeId, + stripe_size: ShardStripeSize, cancel: &CancellationToken, ) -> Result<(), NotifyError> { let mut locked = self.state.lock().await; - let entry = locked - .entry(tenant_shard_id.tenant_id) - .or_insert_with(|| ComputeHookTenant { shards: Vec::new() }); - let shard_index = 
ShardIndex { - shard_count: tenant_shard_id.shard_count, - shard_number: tenant_shard_id.shard_number, + use std::collections::hash_map::Entry; + let tenant = match locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } }; - let mut set = false; - for (existing_shard, existing_node) in &mut entry.shards { - if *existing_shard == shard_index { - *existing_node = node_id; - set = true; - } - } - if !set { - entry.shards.push((shard_index, node_id)); - } - - let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await; + let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id); let Some(reconfigure_request) = reconfigure_request else { // The tenant doesn't yet have pageservers for all its shards: we won't notify anything // until it does. @@ -316,3 +378,85 @@ impl ComputeHook { } } } + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::shard::{ShardCount, ShardNumber}; + use utils::id::TenantId; + + use super::*; + + #[test] + fn tenant_updates() -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let mut tenant_state = ComputeHookTenant::new( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(0), + shard_number: ShardNumber(0), + }, + ShardStripeSize(12345), + NodeId(1), + ); + + // An unsharded tenant is always ready to emit a notification + assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .shards + .len(), + 1 + ); + assert!(tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .stripe_size + .is_none()); + + // Writing the first shard of a multi-sharded situation (i.e. 
in a split) + // resets the tenant state and puts it in an non-notifying state (need to + // see all shards) + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(1), + }, + ShardStripeSize(32768), + NodeId(1), + ); + assert!(tenant_state.maybe_reconfigure(tenant_id).is_none()); + + // Writing the second shard makes it ready to notify + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(0), + }, + ShardStripeSize(32768), + NodeId(1), + ); + + assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .shards + .len(), + 2 + ); + assert_eq!( + tenant_state + .maybe_reconfigure(tenant_id) + .unwrap() + .stripe_size, + Some(ShardStripeSize(32768)) + ); + + Ok(()) + } +} diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index d4f940373f..0fa6e8e2f8 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -565,7 +565,12 @@ impl Reconciler { if let Some(node_id) = self.intent.attached { let result = self .compute_hook - .notify(self.tenant_shard_id, node_id, &self.cancel) + .notify( + self.tenant_shard_id, + node_id, + self.shard.stripe_size, + &self.cancel, + ) .await; if let Err(e) = &result { // It is up to the caller whether they want to drop out on this error, but they don't have to: diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index bc34c9dcf6..ff35567ff3 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -283,7 +283,11 @@ impl Service { // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. if let Some(attached_at) = tenant_state.stably_attached() { - compute_notifications.push((*tenant_shard_id, attached_at)); + compute_notifications.push(( + *tenant_shard_id, + attached_at, + tenant_state.shard.stripe_size, + )); } } } @@ -493,7 +497,7 @@ impl Service { /// Returns a set of any shards for which notifications where not acked within the deadline. async fn compute_notify_many( &self, - notifications: Vec<(TenantShardId, NodeId)>, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, deadline: Instant, ) -> HashSet { let compute_hook = self.inner.read().unwrap().compute_hook.clone(); @@ -504,11 +508,14 @@ impl Service { // Construct an async stream of futures to invoke the compute notify function: we do this // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. 
let mut stream = futures::stream::iter(notifications.into_iter()) - .map(|(tenant_shard_id, node_id)| { + .map(|(tenant_shard_id, node_id, stripe_size)| { let compute_hook = compute_hook.clone(); let cancel = self.cancel.clone(); async move { - if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await { + if let Err(e) = compute_hook + .notify(tenant_shard_id, node_id, stripe_size, &cancel) + .await + { tracing::error!( %tenant_shard_id, %node_id, @@ -1396,7 +1403,10 @@ impl Service { // First check if this is a creation or an update let create_or_update = self.tenant_location_config_prepare(tenant_id, req); - let mut result = TenantLocationConfigResponse { shards: Vec::new() }; + let mut result = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; let waiters = match create_or_update { TenantCreateOrUpdate::Create((create_req, placement_policy)) => { let (create_resp, waiters) = @@ -1452,6 +1462,11 @@ impl Service { continue; }; + // Update stripe size + if result.stripe_size.is_none() && shard.shard.count.count() > 1 { + result.stripe_size = Some(shard.shard.stripe_size); + } + shard.policy = placement_policy; shard.config = tenant_config; if let Some(generation) = update_generation { @@ -2456,7 +2471,7 @@ impl Service { // as at this point in the split process we have succeeded and this part is infallible: // we will never need to do any special recovery from this state. - child_locations.push((child, pageserver)); + child_locations.push((child, pageserver, child_shard.stripe_size)); tenants.insert(child, child_state); response.new_shards.push(child); @@ -2466,8 +2481,11 @@ impl Service { // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); - for (child_id, child_ps) in child_locations { - if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await { + for (child_id, child_ps, stripe_size) in child_locations { + if let Err(e) = compute_hook + .notify(child_id, child_ps, stripe_size, &self.cancel) + .await + { tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", child_id, child_ps); failed_notifications.push(child_id); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index cf647a5f9b..1feec5cd9b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1024,7 +1024,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re }) .collect::>() }; - endpoint.reconfigure(pageservers).await?; + endpoint.reconfigure(pageservers, None).await?; } "stop" => { let endpoint_id = sub_args diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5a75bc2a1d..10e4c5d69f 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -52,6 +52,7 @@ use compute_api::spec::RemoteExtSpec; use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -735,7 +736,11 @@ impl Endpoint { } } - pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> { + pub async fn reconfigure( + &self, + mut pageservers: Vec<(Host, u16)>, + stripe_size: Option, + ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); let file = std::fs::File::open(spec_path)?; @@ -765,6 +770,9 @@ impl 
Endpoint { let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); assert!(!pageserver_connstr.is_empty()); spec.pageserver_connstring = Some(pageserver_connstr); + if stripe_size.is_some() { + spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + } let client = reqwest::Client::new(); let response = client diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d583866290..57497e3831 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -435,6 +435,8 @@ pub struct TenantShardLocation { #[serde(deny_unknown_fields)] pub struct TenantLocationConfigResponse { pub shards: Vec, + // If the shards' ShardCount count is >1, stripe_size will be set. + pub stripe_size: Option, } #[derive(Serialize, Deserialize, Debug)] diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 19b5fb7e79..d924224a32 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1339,6 +1339,10 @@ components: type: array items: $ref: "#/components/schemas/TenantShardLocation" + stripe_size: + description: If multiple shards are present, this field contains the sharding stripe size, else it is null. + type: integer + nullable: true TenantShardLocation: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9d92fbaee0..6aaf1ab27e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1451,11 +1451,12 @@ async fn put_tenant_location_config_handler( tenant::SpawnMode::Eager }; - let attached = state + let tenant = state .tenant_manager .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx) - .await? - .is_some(); + .await?; + let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size()); + let attached = tenant.is_some(); if let Some(_flush_ms) = flush { match state @@ -1477,12 +1478,20 @@ async fn put_tenant_location_config_handler( // This API returns a vector of pageservers where the tenant is attached: this is // primarily for use in the sharding service. For compatibilty, we also return this // when called directly on a pageserver, but the payload is always zero or one shards. 
- let mut response = TenantLocationConfigResponse { shards: Vec::new() }; + let mut response = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; if attached { response.shards.push(TenantShardLocation { shard_id: tenant_shard_id, node_id: state.conf.id, - }) + }); + if tenant_shard_id.shard_count.count() > 1 { + // Stripe size should be set if we are attached + debug_assert!(stripe_size.is_some()); + response.stripe_size = stripe_size; + } } json_response(StatusCode::OK, response) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3423b50eaa..b24c06c4da 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -22,6 +22,7 @@ use pageserver_api::models; use pageserver_api::models::TimelineState; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; @@ -2086,6 +2087,10 @@ impl Tenant { &self.tenant_shard_id } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { + self.shard_identity.stripe_size + } + pub(crate) fn get_generation(&self) -> Generation { self.generation } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index bc77dfd084..aecc244a47 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -1,7 +1,7 @@ import time from collections import defaultdict from datetime import datetime, timezone -from typing import Any, Dict, List +from typing import Any, Dict, List, Union import pytest from fixtures.log_helper import log @@ -443,10 +443,12 @@ def test_sharding_service_compute_hook( # Initial notification from tenant creation assert len(notifications) == 1 - expect = { + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { "tenant_id": str(env.initial_tenant), + "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], } + assert notifications[0] == expect env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"}) @@ -460,6 +462,7 @@ def test_sharding_service_compute_hook( log.info(f"notifications: {notifications}") expect = { "tenant_id": str(env.initial_tenant), + "stripe_size": None, "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], } @@ -475,10 +478,27 @@ def test_sharding_service_compute_hook( def received_restart_notification(): assert len(notifications) == 3 - assert notifications[1] == expect + assert notifications[2] == expect wait_until(10, 1, received_restart_notification) + # Splitting a tenant should cause its stripe size to become visible in the compute notification + env.attachment_service.tenant_shard_split(env.initial_tenant, shard_count=2) + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": 32768, + "shards": [ + {"node_id": int(env.pageservers[1].id), "shard_number": 0}, + {"node_id": int(env.pageservers[1].id), "shard_number": 1}, + ], + } + + def received_split_notification(): + assert len(notifications) == 4 + assert notifications[3] == expect + + wait_until(10, 1, received_split_notification) + env.attachment_service.consistency_check() From 5dc2088cf3dd2ff7ed984a337e7331f5a7eabf6c Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 6 Mar 2024 10:52:24 -0500 Subject: [PATCH 32/52] fix(test): drop subscription when test completes (#6975) This pull request mitigates 
https://github.com/neondatabase/neon/issues/6969, but the longer-term problem is that we cannot properly stop Postgres if there is a subscription. --------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_neon_superuser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index e0364dd13f..fd31df84da 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -1,12 +1,9 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.pg_version import PgVersion from fixtures.utils import wait_until -@skip_on_postgres( - PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969" -) def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") @@ -97,3 +94,6 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): assert cur.fetchall()[0][0] != "" cur.execute("RESET ROLE") cur.execute("DROP ROLE not_a_superuser") + query = "DROP SUBSCRIPTION sub CASCADE" + log.info(f"Dropping subscription: {query}") + cur.execute(query) From a9a4a76d1394e330d8ff91188c0987a19bbbdf3a Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Mar 2024 16:47:32 +0000 Subject: [PATCH 33/52] storage controller: misc fixes (#7036) ## Problem Collection of small changes, batched together to reduce CI overhead. ## Summary of changes - Layer download messages include size -- this is useful when watching a pageserver hydrate its on disk cache in the log. - Controller migrate API could put an invalid NodeId into TenantState - Scheduling errors during tenant create could result in creating some shards and not others. - Consistency check could give hard-to-understand failures in tests if a reconcile was in process: explicitly fail the check if reconciles are in progress instead. 
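[Editor's note] The third bullet above is the subtle one: shard creation is a loop, and bailing out on the first scheduling error used to leave the earlier shards behind. Below is a minimal sketch of the all-or-nothing pattern the diff that follows adopts; the types, the `schedule` helper, and the `Vec`-as-controller-state are stand-ins for illustration, not the controller's real API.

```rust
// Sketch of deferring a scheduling failure until every shard record has been
// inserted, so a tenant creation request never silently succeeds for a subset.
#[derive(Debug)]
struct ScheduleError(String);

struct Shard {
    id: u8,
    node: Option<u32>, // which pageserver the shard was scheduled onto, if any
}

fn schedule(shard: &mut Shard, available_nodes: &[u32]) -> Result<(), ScheduleError> {
    match available_nodes.first() {
        Some(n) => {
            shard.node = Some(*n);
            Ok(())
        }
        None => Err(ScheduleError(format!("no nodes for shard {}", shard.id))),
    }
}

fn create_tenant(
    tenants: &mut Vec<Shard>, // stand-in for the controller's shard map
    shard_count: u8,
    available_nodes: &[u32],
) -> Result<(), String> {
    let mut schedule_error = None;

    // Insert every shard first; remember only the first scheduling failure.
    for id in 0..shard_count {
        let mut shard = Shard { id, node: None };
        if let Err(e) = schedule(&mut shard, available_nodes) {
            schedule_error.get_or_insert(e);
        }
        tenants.push(shard);
    }

    // The shards above are retained, but the caller sees one overall error
    // rather than a partial creation that looks like success.
    if let Some(e) = schedule_error {
        return Err(format!("Failed to schedule shard(s): {e:?}"));
    }
    Ok(())
}

fn main() {
    let mut tenants = Vec::new();
    assert!(create_tenant(&mut tenants, 4, &[1, 2, 3]).is_ok());
    assert!(create_tenant(&mut tenants, 4, &[]).is_err());
    assert_eq!(tenants.len(), 8); // shard records exist even when scheduling failed
}
```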
--- .../attachment_service/src/service.rs | 64 +++++++++++++------ pageserver/src/tenant/storage_layer/layer.rs | 2 +- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index ff35567ff3..d162ab5c65 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -1159,9 +1159,12 @@ impl Service { let (waiters, response_shards) = { let mut locked = self.inner.write().unwrap(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); + let result_tx = locked.result_tx.clone(); + let compute_hook = locked.compute_hook.clone(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let mut response_shards = Vec::new(); + let mut schcedule_error = None; for tenant_shard_id in create_ids { tracing::info!("Creating shard {tenant_shard_id}..."); @@ -1198,23 +1201,20 @@ impl Service { continue; } Entry::Vacant(entry) => { - let mut state = TenantState::new( + let state = entry.insert(TenantState::new( tenant_shard_id, ShardIdentity::from_params( tenant_shard_id.shard_number, &create_req.shard_parameters, ), placement_policy.clone(), - ); + )); state.generation = initial_generation; state.config = create_req.config.clone(); - - state.schedule(scheduler).map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; + if let Err(e) = state.schedule(scheduler) { + schcedule_error = Some(e); + } // Only include shards in result if we are attaching: the purpose // of the response is to tell the caller where the shards are attached. @@ -1228,24 +1228,27 @@ impl Service { generation: generation.into().unwrap(), }); } - entry.insert(state) } }; } - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. + if let Some(e) = schcedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } - let result_tx = locked.result_tx.clone(); - let compute_hook = locked.compute_hook.clone(); - - let waiters = locked - .tenants + let waiters = tenants .range_mut(TenantShardId::tenant_range(tenant_id)) .filter_map(|(_shard_id, shard)| { shard.maybe_reconcile( result_tx.clone(), - &pageservers, + nodes, &compute_hook, &self.config, &self.persistence, @@ -2516,6 +2519,19 @@ impl Service { let compute_hook = locked.compute_hook.clone(); let (nodes, tenants, scheduler) = locked.parts_mut(); + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if node.availability != NodeAvailability::Active { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
+ tracing::warn!("Migrating to an unavailable node ({})", node.id); + } + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant shard not found").into(), @@ -2645,6 +2661,18 @@ impl Service { .map(|t| t.to_persistent()) .collect::>(); + // This method can only validate the state of an idle system: if a reconcile is in + // progress, fail out early to avoid giving false errors on state that won't match + // between database and memory under a ReconcileResult is processed. + for t in locked.tenants.values() { + if t.reconciler.is_some() { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} reconciliation in progress", + t.tenant_shard_id + ))); + } + } + (expect_nodes, expect_shards) }; diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index e14a2f22cf..6c46b83622 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -976,7 +976,7 @@ impl LayerInner { } self.consecutive_failures.store(0, Ordering::Relaxed); - tracing::info!("on-demand download successful"); + tracing::info!(size=%self.desc.file_size, "on-demand download successful"); Ok(permit) } From f40b13d801782535737530118fbd6b85ef542658 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 6 Mar 2024 17:09:54 +0000 Subject: [PATCH 34/52] Update client libs for test_runner/pg_clients to their latest versions (#7022) ## Problem Closes https://github.com/neondatabase/neon/security/dependabot/56 Supersedes https://github.com/neondatabase/neon/pull/7013 Workflow run: https://github.com/neondatabase/neon/actions/runs/8157302480 ## Summary of changes - Update client libs for `test_runner/pg_clients` to their latest versions --- .../pg_clients/csharp/npgsql/Dockerfile | 4 +- .../csharp/npgsql/csharp-npgsql.csproj | 4 +- test_runner/pg_clients/java/jdbc/Dockerfile | 4 +- .../pg_clients/python/asyncpg/Dockerfile | 2 +- .../python/asyncpg/requirements.txt | 2 +- .../pg_clients/python/pg8000/Dockerfile | 2 +- .../pg_clients/python/pg8000/requirements.txt | 2 +- .../pg_clients/rust/tokio-postgres/Cargo.lock | 340 ++++++++++-------- .../pg_clients/rust/tokio-postgres/Cargo.toml | 2 +- .../pg_clients/rust/tokio-postgres/Dockerfile | 2 +- .../swift/PostgresClientKitExample/Dockerfile | 4 +- .../swift/PostgresNIOExample/Dockerfile | 4 +- .../swift/PostgresNIOExample/Package.resolved | 37 +- .../swift/PostgresNIOExample/Package.swift | 4 +- .../typescript/postgresql-client/Dockerfile | 2 +- .../postgresql-client/package-lock.json | 75 ++-- .../typescript/postgresql-client/package.json | 2 +- .../typescript/serverless-driver/Dockerfile | 2 +- .../serverless-driver/package-lock.json | 16 +- .../typescript/serverless-driver/package.json | 4 +- 20 files changed, 291 insertions(+), 223 deletions(-) diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile index b23eb2e5eb..71717a6006 100644 --- a/test_runner/pg_clients/csharp/npgsql/Dockerfile +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build WORKDIR /source COPY *.csproj . @@ -7,7 +7,7 @@ RUN dotnet restore COPY . . RUN dotnet publish -c release -o /app --no-restore -FROM mcr.microsoft.com/dotnet/runtime:7.0 +FROM mcr.microsoft.com/dotnet/runtime:8.0 WORKDIR /app COPY --from=build /app . 
diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index bb4427f2c4..50243e3ea7 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -2,13 +2,13 @@ Exe - net7.0 + net8.0 enable enable - + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 74eb9bdc32..7e074e07b8 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,10 +1,10 @@ -FROM openjdk:20 +FROM openjdk:21 WORKDIR /source COPY . . WORKDIR /app -RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \ +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.7.2.jar && \ javac -d /app /source/Example.java CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile index 8b6d56b8fb..f2cc37a7bb 100644 --- a/test_runner/pg_clients/python/asyncpg/Dockerfile +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt index b33c21474c..61972959a9 100644 --- a/test_runner/pg_clients/python/asyncpg/requirements.txt +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -1 +1 @@ -asyncpg==0.27.0 +asyncpg==0.29.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile index ebef1f9059..ee1de20da5 100644 --- a/test_runner/pg_clients/python/pg8000/Dockerfile +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index a8407c3cb0..e086a937e6 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.29.8 +pg8000==1.30.5 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 3ac0f16e4b..a4a2426b97 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.74" +version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", @@ -51,9 +51,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.4" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bitflags" @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" [[package]] name = "byteorder" @@ -96,12 +96,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] +checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" [[package]] name = "cfg-if" @@ -111,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "core-foundation" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -121,15 +118,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "cpufeatures" -version = "0.2.9" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] @@ -157,12 +154,12 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -200,9 +197,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "futures" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ "futures-channel", "futures-core", @@ -215,9 +212,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -225,15 +222,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" dependencies = [ "futures-core", "futures-task", @@ -242,15 +239,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", @@ -259,21 +256,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -299,9 +296,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", @@ -310,9 +307,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.0" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "hmac" @@ -325,9 +322,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -340,15 +337,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -362,9 +359,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "md-5" @@ -378,28 +375,28 @@ dependencies = [ [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.8" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -422,26 +419,26 @@ dependencies = [ [[package]] name = "object" -version = "0.32.1" +version = "0.32.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.60" +version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" +checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "cfg-if", "foreign-types", "libc", @@ -469,9 +466,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.96" +version = "0.9.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" +checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" dependencies = [ "cc", "libc", @@ -497,16 +494,16 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.4.1", + "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.48.5", ] [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" @@ -540,9 +537,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "postgres-native-tls" @@ -594,18 +591,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -640,15 +637,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -676,24 +664,24 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" 
[[package]] name = "rustix" -version = "0.38.19" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "schannel" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -753,18 +741,18 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "socket2" -version = "0.5.4" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -786,9 +774,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" -version = "2.0.38" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -797,15 +785,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -825,9 +812,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.33.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", @@ -836,14 +823,14 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", @@ -888,9 +875,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" +checksum = 
"5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -927,9 +914,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" @@ -939,9 +926,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ "tinyvec", ] @@ -965,10 +952,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.87" +name = "wasite" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -976,9 +969,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -991,9 +984,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1001,9 +994,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -1014,15 +1007,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ 
-1030,11 +1023,12 @@ dependencies = [ [[package]] name = "whoami" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" +checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" dependencies = [ - "wasm-bindgen", + "redox_syscall", + "wasite", "web-sys", ] @@ -1044,7 +1038,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", ] [[package]] @@ -1053,13 +1056,28 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] @@ -1068,38 +1086,80 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 6f100aafd5..0f420e5b06 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -9,7 +9,7 @@ publish = false [dependencies] native-tls = "0.2.11" postgres-native-tls = "0.5.0" -tokio = { version = "1.33", features=["rt", "macros"] } +tokio = { version = "1.36", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 1d3709803e..8611e66cbb 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.73 +FROM rust:1.76 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 9538cf4ed4..0402838820 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 61e1d1bba6..9130e0973f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 9f13106011..023e03a7b1 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -5,8 +5,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab", - "version" : "1.16.0" + "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", + "version" : "1.20.2" } }, { @@ -14,8 +14,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-atomics.git", "state" : { - "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10", - "version" : "1.1.0" + "revision" : "cd142fd2f64be2100422d658e7411e39489da985", + "version" : "1.2.0" } }, { @@ -41,8 +41,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-log.git", "state" : { - "revision" : "32e8d724467f8fe623624570367e3d50c5638e46", - "version" : "1.5.2" + "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5", + "version" : "1.5.4" } }, { @@ -50,8 +50,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-metrics.git", "state" : { - "revision" : "9b39d811a83cf18b79d7d5513b06f8b290198b10", - "version" : "2.3.3" + "revision" : "971ba26378ab69c43737ee7ba967a896cb74c0d1", + "version" : "2.4.1" } }, { @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf", - "version" : "2.54.0" + "revision" : "635b2589494c97e48c62514bc8b37ced762e0a62", + "version" : "2.63.0" } }, { @@ -68,8 +68,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-ssl.git", "state" : { - "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83", - "version" : "2.24.0" + "revision" : "7c381eb6083542b124a6c18fae742f55001dc2b5", + "version" : "2.26.0" } }, { @@ -77,8 +77,17 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-transport-services.git", "state" : { - "revision" : "41f4098903878418537020075a4d8a6e20a0b182", - "version" : "1.17.0" + "revision" : "6cbe0ed2b394f21ab0d46b9f0c50c6be964968ce", + "version" : "1.20.1" + } + }, + { + "identity" : "swift-system", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-system.git", + "state" : { + "revision" : "025bcb1165deab2e20d4eaba79967ce73013f496", + "version" : "1.2.1" } } ], diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index a80590daa2..637eb4bc9d 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.9 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ 
b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index 4cedf56acd..b4f8587eac 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,24 +5,24 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } }, "node_modules/doublylinked": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.2.tgz", - "integrity": "sha512-TDh0XfQWWDrfvGdAN0hLNIdkTXlw04nVCO5B/37ie4dV0yw1iT9ZrZ6tD+q/0SwXxeI/u6TF9Mxgd7s5/XYV6A==", + "version": "2.5.4", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.4.tgz", + "integrity": "sha512-jBCKDnFkEHJRjQvYEl5N9VngRV8ypHgw6a52OK4VN57eV2r2rYvgOx9uABdY78INNoW7S6auULp+KBVm/jfYqw==", "engines": { "node": ">= 10.0" } }, "node_modules/lightning-pool": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.1.tgz", - "integrity": "sha512-/pUIoGD3nzTH/wI4TYiJM3cLPeUOzGMTfFeBRuxaOAnwL0LZfwvqn5YFqsfyF98M0C3UXxWgfTz+Lu6okkno+g==", + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.2.tgz", + "integrity": "sha512-KW0Df0IbjNLxy5wAsdErTKYtHGwefLRQseHNksEctyaL7gtRwJT0nqLa2uiRdNYDwKSnZtqOjSjUNtfxmfH1qw==", "dependencies": { - "doublylinked": "^2.5.2", - "putil-promisify": "^1.8.6" + "doublylinked": "^2.5.3", + "putil-promisify": "^1.10.1" } }, "node_modules/obuf": { @@ -42,16 +42,16 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.9", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz", - "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==", + "version": "2.10.5", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", + "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", "dependencies": { - "doublylinked": "^2.5.2", - "lightning-pool": "^4.2.1", + "doublylinked": "^2.5.4", + "lightning-pool": "^4.2.2", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.7.0", - "putil-merge": "^3.10.3", - "putil-promisify": "^1.10.0", + "power-tasks": "^1.7.3", + "putil-merge": "^3.12.1", + "putil-promisify": "^1.10.1", "putil-varhelpers": "^1.6.5" }, "engines": { @@ -60,30 +60,29 @@ } }, "node_modules/power-tasks": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz", - "integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==", + "version": "1.7.3", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.3.tgz", + "integrity": "sha512-EnkjLfaX4PxFYHbUWyWzlE4I8SgctaW9jx4qQXrVRoELlqBXrxIMtuhHzRwsHv2qs1tO7efOcZa6/wDCdCjRfA==", "dependencies": { - "doublylinked": "^2.5.2", - "strict-typed-events": "^2.3.1" + "doublylinked": "^2.5.4", + "strict-typed-events": "^2.3.3" }, "engines": { - "node": ">=14.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/putil-merge": { - "version": "3.10.3", - "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz", - "integrity": 
"sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==", + "version": "3.12.1", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.12.1.tgz", + "integrity": "sha512-4clPyRkJPrd5zl98AP7I3JamyXbx0ixe2CnfvGwoTyWSr7Kslcv8weoKjfU4BMBifkWIRL54l4OrNe97pYcDwQ==", "engines": { "node": ">= 10.0" } }, "node_modules/putil-promisify": { - "version": "1.10.0", - "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.0.tgz", - "integrity": "sha512-zYPoAoMxmf8pC+I75kRkYkVMwU4ZbZl82aTGema175bmhQ06BEJuuOlzOy1buQK9G+hCyQ+BFpzMTKAJhD8rZw==", + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.1.tgz", + "integrity": "sha512-1jm0egJNrj5eBDRj15Cg08RNHDV91OVEHeeYjAFRcs663PXxFokndxcJAGbaO6CSErCTp8eTgC8vuOF+fvXIAA==", "engines": { "node": ">= 14.0" } @@ -97,21 +96,21 @@ } }, "node_modules/strict-typed-events": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.1.tgz", - "integrity": "sha512-Z1h8KpVbrVg34Vwy/VwTD/tS9tFebH2h1Kvw4xnPkKpkISMwUpnqwU44rMfkKMpXbFCybIgDt7ARoCGTzURZhQ==", + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.3.tgz", + "integrity": "sha512-Vc8/N5giCVpO2n5BCskqDD9ns7RkdEq0pFd4yQk1ROULusJDbjORNvbtyEPxxK7Xqn9/NdW8XHLxv/PvUTgFsA==", "dependencies": { - "putil-promisify": "^1.8.5", - "ts-gems": "^2.2.0" + "putil-promisify": "^1.10.1", + "ts-gems": "^3.1.0" }, "engines": { "node": ">=16.0" } }, "node_modules/ts-gems": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz", - "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A==" + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-3.1.1.tgz", + "integrity": "sha512-Li1Z44FnxN06c1lBwFepb932jPYT+4eOvOmoiC30lOTkvOJOERr9xZFg3UA9y19OYO9CrW3ZSqNL66DUSuwFTw==" } } } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 12703ce89f..07ec100d0d 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index 72cc452817..5a3ad3c238 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,14 +5,14 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.16.0" } }, "node_modules/@neondatabase/serverless": { - "version": "0.4.18", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz", - "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==", + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", + "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", "dependencies": { "@types/pg": "8.6.6" } @@ -96,9 +96,9 @@ } }, "node_modules/ws": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", - "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", + "version": "8.16.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz", + "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==", "engines": { "node": ">=10.0.0" }, diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 840c7a5c4c..9d9da0f42c 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.16.0" } } From 0b330e1310916221b4f43c1e8c53414a68633189 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 6 Mar 2024 12:20:44 -0500 Subject: [PATCH 35/52] upgrade neon extension on startup (#7029) ## Problem Fix https://github.com/neondatabase/neon/issues/7003. Fix https://github.com/neondatabase/neon/issues/6982. Currently, neon extension is only upgraded when new compute spec gets applied, for example, when creating a new role or creating a new database. This also resolves `neon.lfc_stat` not found warnings in prod. ## Summary of changes This pull request adds the logic to spawn a background thread to upgrade the neon extension version if the compute is a primary. If for whatever reason the upgrade fails, it reports an error to the console and does not impact compute node state. This change can be further applied to 3rd-party extension upgrades. We can silently upgrade the version of 3rd party extensions in the background in the future. Questions: * Does alter extension takes some kind of lock that will block user requests? * Does `ALTER EXTENSION` writes to the database if nothing needs to be upgraded? (may impact storage size). Otherwise it's safe to land this pull request. 
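For reference, the change described above reduces to a fire-and-forget background thread that runs an idempotent `ALTER EXTENSION neon UPDATE` against the local compute and only logs on failure. A minimal sketch of that pattern follows, assuming the blocking `postgres` client; the helper name and connection-string plumbing are illustrative, and the real implementation is in `compute_tools/src/compute.rs` and `compute_tools/src/spec.rs` in the diff below:

```rust
use std::thread;

use anyhow::Context;
use postgres::{Client, NoTls};
use tracing::error;

// Hypothetical helper mirroring the patch: spawn a thread, upgrade the extension,
// and log (but otherwise swallow) any error so compute node state is unaffected.
fn spawn_neon_extension_upgrade(connstr: String) {
    thread::spawn(move || {
        let run = || -> anyhow::Result<()> {
            let mut client =
                Client::connect(connstr.as_str(), NoTls).context("connect to compute")?;
            // Safe to run unconditionally: a no-op if the extension is already current.
            client.simple_query("ALTER EXTENSION neon UPDATE")?;
            Ok(())
        };
        if let Err(err) = run() {
            error!("neon extension upgrade failed: {err:#}");
        }
    });
}
```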
Signed-off-by: Alex Chi Z --- compute_tools/src/compute.rs | 43 +++++++++++++++++++++++++++--------- compute_tools/src/spec.rs | 12 +++++++++- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index da271e49cd..5613e6c868 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -764,6 +764,26 @@ impl ComputeNode { Ok((pg, logs_handle)) } + /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// configure the database after applying the compute spec. Currently, it upgrades the neon extension + /// version. In the future, it may upgrade all 3rd-party extensions. + #[instrument(skip_all)] + pub fn post_apply_config(&self) -> Result<()> { + let connstr = self.connstr.clone(); + thread::spawn(move || { + let func = || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_neon_extension_upgrade(&mut client) + .context("handle_neon_extension_upgrade")?; + Ok::<_, anyhow::Error>(()) + }; + if let Err(err) = func() { + error!("error while post_apply_config: {err:#}"); + } + }); + Ok(()) + } + /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { @@ -998,18 +1018,21 @@ impl ComputeNode { let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + if pspec.spec.mode == ComputeMode::Primary { + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... 
+ config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; + self.pg_reload_conf()?; - self.apply_config(&compute_state)?; + self.apply_config(&compute_state)?; - config::compute_ctl_temp_override_remove(pgdata_path)?; - self.pg_reload_conf()?; + config::compute_ctl_temp_override_remove(pgdata_path)?; + self.pg_reload_conf()?; + } + self.post_apply_config()?; } let startup_end_time = Utc::now(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index d5fd2c9462..84a5a263af 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -744,7 +744,17 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // - extension was just installed // - extension was already installed and is up to date let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension schema with query: {}", query); + info!("update neon extension version with query: {}", query); + client.simple_query(query)?; + + Ok(()) +} + +#[instrument(skip_all)] +pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); client.simple_query(query)?; Ok(()) From c2876ec55d985d2820467bd0e248500a29be649c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 7 Mar 2024 12:36:47 +0000 Subject: [PATCH 36/52] proxy http tls investigations (#7045) ## Problem Some HTTP-specific TLS errors ## Summary of changes Add more logging, vendor `tls-listener` with minor modifications. --- Cargo.lock | 15 -- Cargo.toml | 1 - proxy/Cargo.toml | 1 - proxy/src/metrics.rs | 10 +- proxy/src/protocol2.rs | 78 +++++++- proxy/src/proxy.rs | 14 +- proxy/src/serverless.rs | 50 +++-- proxy/src/serverless/tls_listener.rs | 283 +++++++++++++++++++++++++++ proxy/src/serverless/websocket.rs | 6 + proxy/src/stream.rs | 6 +- 10 files changed, 418 insertions(+), 46 deletions(-) create mode 100644 proxy/src/serverless/tls_listener.rs diff --git a/Cargo.lock b/Cargo.lock index 864e5c9046..167a2b2179 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4216,7 +4216,6 @@ dependencies = [ "thiserror", "tikv-jemalloc-ctl", "tikv-jemallocator", - "tls-listener", "tokio", "tokio-postgres", "tokio-postgres-rustls", @@ -5794,20 +5793,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tls-listener" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd" -dependencies = [ - "futures-util", - "hyper", - "pin-project-lite", - "thiserror", - "tokio", - "tokio-rustls", -] - [[package]] name = "tokio" version = "1.36.0" diff --git a/Cargo.toml b/Cargo.toml index 90b02b30ec..42deaac19b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -156,7 +156,6 @@ test-context = "0.1" thiserror = "1.0" tikv-jemallocator = "0.5" tikv-jemalloc-ctl = "0.5" -tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0777d361d2..d8112c8bf0 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -68,7 +68,6 @@ task-local-extensions.workspace = true thiserror.workspace = true 
tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 2464b1e611..0477176c45 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,7 +4,7 @@ use ::metrics::{ register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, }; -use metrics::{register_int_counter_pair, IntCounterPair}; +use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair}; use once_cell::sync::Lazy; use tokio::time; @@ -312,3 +312,11 @@ pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { ) .unwrap() }); + +pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "proxy_tls_handshake_failures", + "Number of TLS handshake failures", + ) + .unwrap() +}); diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 1d8931be85..3a7aabca32 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,22 +1,27 @@ //! Proxy Protocol V2 implementation use std::{ - future::poll_fn, - future::Future, + future::{poll_fn, Future}, io, net::SocketAddr, pin::{pin, Pin}, + sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; +use hyper::server::accept::Accept; use hyper::server::conn::{AddrIncoming, AddrStream}; +use metrics::IntCounterPairGuard; use pin_project_lite::pin_project; -use tls_listener::AsyncAccept; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; +use uuid::Uuid; + +use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept}; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, + pub protocol: &'static str, } pin_project! { @@ -327,7 +332,7 @@ impl AsyncRead for WithClientIp { } impl AsyncAccept for ProxyProtocolAccept { - type Connection = WithClientIp; + type Connection = WithConnectionGuard>; type Error = io::Error; @@ -336,11 +341,74 @@ impl AsyncAccept for ProxyProtocolAccept { cx: &mut Context<'_>, ) -> Poll>> { let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); + tracing::info!(protocol = self.protocol, "accepted new TCP connection"); let Some(conn) = conn else { return Poll::Ready(None); }; - Poll::Ready(Some(Ok(WithClientIp::new(conn)))) + Poll::Ready(Some(Ok(WithConnectionGuard { + inner: WithClientIp::new(conn), + connection_id: Uuid::new_v4(), + gauge: Mutex::new(Some( + NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&[self.protocol]) + .guard(), + )), + }))) + } +} + +pin_project! 
{ + pub struct WithConnectionGuard { + #[pin] + pub inner: T, + pub connection_id: Uuid, + pub gauge: Mutex>, + } +} + +impl AsyncWrite for WithConnectionGuard { + #[inline] + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + self.project().inner.poll_write(cx, buf) + } + + #[inline] + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_flush(cx) + } + + #[inline] + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_shutdown(cx) + } + + #[inline] + fn poll_write_vectored( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + bufs: &[io::IoSlice<'_>], + ) -> Poll> { + self.project().inner.poll_write_vectored(cx, bufs) + } + + #[inline] + fn is_write_vectored(&self) -> bool { + self.inner.is_write_vectored() + } +} + +impl AsyncRead for WithConnectionGuard { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + self.project().inner.poll_read(cx, buf) } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index d94fc67491..aeba08bc4f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -24,6 +24,7 @@ use crate::{ }; use futures::TryFutureExt; use itertools::Itertools; +use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -78,10 +79,16 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; + let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["tcp"]) + .guard(); + let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + connections.spawn(async move { let mut socket = WithClientIp::new(socket); let mut peer_addr = peer_addr.ip(); @@ -116,6 +123,7 @@ pub async fn task_main( socket, ClientMode::Tcp, endpoint_rate_limiter, + conn_gauge, ) .instrument(span.clone()) .await; @@ -229,13 +237,11 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, + conn_gauge: IntCounterPairGuard, ) -> Result>, ClientRequestError> { info!("handling interactive connection from client"); let proto = ctx.protocol; - let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[proto]) - .guard(); let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[proto]) .guard(); @@ -325,7 +331,7 @@ pub async fn handle_client( aux: node.aux.clone(), compute: node, req: _request_gauge, - conn: _client_gauge, + conn: conn_gauge, cancel: session, })) } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index b5806aec53..c81ae03b23 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -6,6 +6,7 @@ mod backend; mod conn_pool; mod json; mod sql_over_http; +pub mod tls_listener; mod websocket; pub use conn_pool::GlobalConnPoolOptions; @@ -20,8 +21,8 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; use crate::context::RequestMonitoring; -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; +use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use 
crate::{cancellation::CancellationHandler, config::ProxyConfig}; @@ -98,6 +99,7 @@ pub async fn task_main( let _ = addr_incoming.set_nodelay(true); let addr_incoming = ProxyProtocolAccept { incoming: addr_incoming, + protocol: "http", }; let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); @@ -105,18 +107,34 @@ pub async fn task_main( let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { if let Err(err) = conn { - error!("failed to accept TLS connection for websockets: {err:?}"); + error!( + protocol = "http", + "failed to accept TLS connection: {err:?}" + ); + TLS_HANDSHAKE_FAILURES.inc(); ready(false) } else { + info!(protocol = "http", "accepted new TLS connection"); ready(true) } }); let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream>| { - let (io, _) = stream.get_ref(); - let client_addr = io.client_addr(); - let remote_addr = io.inner.remote_addr(); + |stream: &tokio_rustls::server::TlsStream< + WithConnectionGuard>, + >| { + let (conn, _) = stream.get_ref(); + + // this is jank. should dissapear with hyper 1.0 migration. + let gauge = conn + .gauge + .lock() + .expect("lock should not be poisoned") + .take() + .expect("gauge should be set on connection start"); + + let client_addr = conn.inner.client_addr(); + let remote_addr = conn.inner.inner.remote_addr(); let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -127,8 +145,8 @@ pub async fn task_main( None if config.require_client_ip => bail!("missing required client ip"), None => remote_addr, }; - Ok(MetricService::new(hyper::service::service_fn( - move |req: Request| { + Ok(MetricService::new( + hyper::service::service_fn(move |req: Request| { let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); @@ -149,8 +167,9 @@ pub async fn task_main( .map_or_else(|e| e.into_response(), |r| r), ) } - }, - ))) + }), + gauge, + )) } }, ); @@ -172,13 +191,8 @@ struct MetricService { } impl MetricService { - fn new(inner: S) -> MetricService { - MetricService { - inner, - _gauge: NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["http"]) - .guard(), - } + fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService { + MetricService { inner, _gauge } } } diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs new file mode 100644 index 0000000000..6196ff393c --- /dev/null +++ b/proxy/src/serverless/tls_listener.rs @@ -0,0 +1,283 @@ +use std::{ + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use futures::{Future, Stream, StreamExt}; +use pin_project_lite::pin_project; +use thiserror::Error; +use tokio::{ + io::{AsyncRead, AsyncWrite}, + task::JoinSet, + time::timeout, +}; + +/// Default timeout for the TLS handshake. +pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10); + +/// Trait for TLS implementation. +/// +/// Implementations are provided by the rustls and native-tls features. +pub trait AsyncTls: Clone { + /// The type of the TLS stream created from the underlying stream. + type Stream: Send + 'static; + /// Error type for completing the TLS handshake + type Error: std::error::Error + Send + 'static; + /// Type of the Future for the TLS stream that is accepted. 
+ type AcceptFuture: Future> + Send + 'static; + + /// Accept a TLS connection on an underlying stream + fn accept(&self, stream: C) -> Self::AcceptFuture; +} + +/// Asynchronously accept connections. +pub trait AsyncAccept { + /// The type of the connection that is accepted. + type Connection: AsyncRead + AsyncWrite; + /// The type of error that may be returned. + type Error; + + /// Poll to accept the next connection. + fn poll_accept( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>>; + + /// Return a new `AsyncAccept` that stops accepting connections after + /// `ender` completes. + /// + /// Useful for graceful shutdown. + /// + /// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs) + /// for example of how to use. + fn until(self, ender: F) -> Until + where + Self: Sized, + { + Until { + acceptor: self, + ender, + } + } +} + +pin_project! { + /// + /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself + /// encrypted using TLS. + /// + /// It is similar to: + /// + /// ```ignore + /// tcpListener.and_then(|s| tlsAcceptor.accept(s)) + /// ``` + /// + /// except that it has the ability to accept multiple transport-level connections + /// simultaneously while the TLS handshake is pending for other connections. + /// + /// By default, if a client fails the TLS handshake, that is treated as an error, and the + /// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper + /// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections. + /// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this. + /// + /// Note that if the maximum number of pending connections is greater than 1, the resulting + /// [`T::Stream`][4] connections may come in a different order than the connections produced by the + /// underlying listener. + /// + /// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html + /// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs + /// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs + /// [4]: AsyncTls::Stream + /// + #[allow(clippy::type_complexity)] + pub struct TlsListener> { + #[pin] + listener: A, + tls: T, + waiting: JoinSet, tokio::time::error::Elapsed>>, + timeout: Duration, + } +} + +/// Builder for `TlsListener`. +#[derive(Clone)] +pub struct Builder { + tls: T, + handshake_timeout: Duration, +} + +/// Wraps errors from either the listener or the TLS Acceptor +#[derive(Debug, Error)] +pub enum Error { + /// An error that arose from the listener ([AsyncAccept::Error]) + #[error("{0}")] + ListenerError(#[source] LE), + /// An error that occurred during the TLS accept handshake + #[error("{0}")] + TlsAcceptError(#[source] TE), +} + +impl TlsListener +where + T: AsyncTls, +{ + /// Create a `TlsListener` with default options. + pub fn new(tls: T, listener: A) -> Self { + builder(tls).listen(listener) + } +} + +impl TlsListener +where + A: AsyncAccept, + A::Error: std::error::Error, + T: AsyncTls, +{ + /// Accept the next connection + /// + /// This is essentially an alias to `self.next()` with a more domain-appropriate name. + pub async fn accept(&mut self) -> Option<::Item> + where + Self: Unpin, + { + self.next().await + } + + /// Replaces the Tls Acceptor configuration, which will be used for new connections. + /// + /// This can be used to change the certificate used at runtime. 
+ pub fn replace_acceptor(&mut self, acceptor: T) { + self.tls = acceptor; + } + + /// Replaces the Tls Acceptor configuration from a pinned reference to `Self`. + /// + /// This is useful if your listener is `!Unpin`. + /// + /// This can be used to change the certificate used at runtime. + pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) { + *self.project().tls = acceptor; + } +} + +impl Stream for TlsListener +where + A: AsyncAccept, + A::Error: std::error::Error, + T: AsyncTls, +{ + type Item = Result>; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let mut this = self.project(); + + loop { + match this.listener.as_mut().poll_accept(cx) { + Poll::Pending => break, + Poll::Ready(Some(Ok(conn))) => { + this.waiting + .spawn(timeout(*this.timeout, this.tls.accept(conn))); + } + Poll::Ready(Some(Err(e))) => { + return Poll::Ready(Some(Err(Error::ListenerError(e)))); + } + Poll::Ready(None) => return Poll::Ready(None), + } + } + + loop { + return match this.waiting.poll_join_next(cx) { + Poll::Ready(Some(Ok(Ok(conn)))) => { + Poll::Ready(Some(conn.map_err(Error::TlsAcceptError))) + } + // The handshake timed out, try getting another connection from the queue + Poll::Ready(Some(Ok(Err(_)))) => continue, + // The handshake panicked + Poll::Ready(Some(Err(e))) if e.is_panic() => { + std::panic::resume_unwind(e.into_panic()) + } + // The handshake was externally aborted + Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"), + _ => Poll::Pending, + }; + } + } +} + +impl AsyncTls for tokio_rustls::TlsAcceptor { + type Stream = tokio_rustls::server::TlsStream; + type Error = std::io::Error; + type AcceptFuture = tokio_rustls::Accept; + + fn accept(&self, conn: C) -> Self::AcceptFuture { + tokio_rustls::TlsAcceptor::accept(self, conn) + } +} + +impl Builder { + /// Set the timeout for handshakes. + /// + /// If a timeout takes longer than `timeout`, then the handshake will be + /// aborted and the underlying connection will be dropped. + /// + /// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`. + pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self { + self.handshake_timeout = timeout; + self + } + + /// Create a `TlsListener` from the builder + /// + /// Actually build the `TlsListener`. The `listener` argument should be + /// an implementation of the `AsyncAccept` trait that accepts new connections + /// that the `TlsListener` will encrypt using TLS. + pub fn listen(&self, listener: A) -> TlsListener + where + T: AsyncTls, + { + TlsListener { + listener, + tls: self.tls.clone(), + waiting: JoinSet::new(), + timeout: self.handshake_timeout, + } + } +} + +/// Create a new Builder for a TlsListener +/// +/// `server_config` will be used to configure the TLS sessions. +pub fn builder(tls: T) -> Builder { + Builder { + tls, + handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT, + } +} + +pin_project! 
{ + /// See [`AsyncAccept::until`] + pub struct Until { + #[pin] + acceptor: A, + #[pin] + ender: E, + } +} + +impl AsyncAccept for Until { + type Connection = A::Connection; + type Error = A::Error; + + fn poll_accept( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + let this = self.project(); + + match this.ender.poll(cx) { + Poll::Pending => this.acceptor.poll_accept(cx), + Poll::Ready(_) => Poll::Ready(None), + } + } +} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 24f2bb7e8c..a72ede6d0a 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,6 +3,7 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, + metrics::NUM_CLIENT_CONNECTION_GAUGE, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -138,6 +139,10 @@ pub async fn serve_websocket( endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; + let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["ws"]) + .guard(); + let res = handle_client( config, &mut ctx, @@ -145,6 +150,7 @@ pub async fn serve_websocket( WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, + conn_gauge, ) .await; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 0d639d2c07..b6b7a85659 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,5 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::TLS_HANDSHAKE_FAILURES; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -224,7 +225,10 @@ impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { match self { - Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?), + Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) + .accept(raw) + .await + .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } From d03ec9d9983554ebf5d0a2ee182536b6c267ff98 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 7 Mar 2024 12:37:52 +0000 Subject: [PATCH 37/52] pageserver: don't validate vectored get on shut-down (#7039) ## Problem We attempted validation for cancelled errors under the assumption that if vectored get fails, sequential get will too. That's not right 100% of times though because sequential get may have the values cached and slip them through even when shutting down. ## Summary of changes Don't validate if either search impl failed due to tenant shutdown. 
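To make the shutdown-handling intent concrete, here is a self-contained toy of the validation-skip pattern; the names are simplified stand-ins, and the real types and match arms live in `pageserver/src/tenant/timeline.rs` in the diff below:

```rust
// Toy model: if either search implementation was cancelled by tenant shutdown,
// comparing results is meaningless, because the other side may have succeeded
// purely from cached values.
enum GetError {
    Cancelled,
    MissingKey(u64),
}

fn should_validate(sequential: &Result<u64, GetError>, vectored: &Result<u64, GetError>) -> bool {
    !matches!(
        (sequential, vectored),
        (Err(GetError::Cancelled), _) | (_, Err(GetError::Cancelled))
    )
}

fn main() {
    // Shutdown on either side: skip validation rather than flag a spurious mismatch.
    assert!(!should_validate(&Err(GetError::Cancelled), &Ok(1)));
    // Ordinary disagreement is still validated (and would panic in the real code).
    assert!(should_validate(&Ok(1), &Err(GetError::MissingKey(7))));
}
```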
--- pageserver/src/tenant/timeline.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 37acebb10a..7ac7c15876 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -863,8 +863,6 @@ impl Timeline { fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { use GetVectoredError::*; match (lhs, rhs) { - (Cancelled, Cancelled) => true, - (_, Cancelled) => true, (Oversized(l), Oversized(r)) => l == r, (InvalidLsn(l), InvalidLsn(r)) => l == r, (MissingKey(l), MissingKey(r)) => l == r, @@ -875,6 +873,8 @@ impl Timeline { } match (&sequential_res, vectored_res) { + (Err(GetVectoredError::Cancelled), _) => {}, + (_, Err(GetVectoredError::Cancelled)) => {}, (Err(seq_err), Ok(_)) => { panic!(concat!("Sequential get failed with {}, but vectored get did not", " - keyspace={:?} lsn={}"), From d3c583efbe2a5f736ae43da4de84479ec4ee81b4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 7 Mar 2024 14:06:48 +0000 Subject: [PATCH 38/52] Rename binary attachment_service -> storage_controller (#7042) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The storage controller binary still has its historic `attachment_service` name -- it will be painful to change this later because we can't atomically update this repo and the helm charts used to deploy. Companion helm chart change: https://github.com/neondatabase/helm-charts/pull/70 ## Summary of changes - Change the name of the binary to `storage_controller` - Skipping renaming things in the source right now: this is just to get rid of the legacy name in external interfaces. --------- Co-authored-by: Arpad Müller --- Dockerfile | 4 ++-- control_plane/attachment_service/Cargo.toml | 4 ++++ control_plane/src/attachment_service.rs | 2 +- control_plane/src/local_env.rs | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 47954a671b..5f82df3e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,7 +53,7 @@ RUN set -e \ --bin pagectl \ --bin safekeeper \ --bin storage_broker \ - --bin attachment_service \ + --bin storage_controller \ --bin proxy \ --bin neon_local \ --locked --release \ @@ -81,7 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index bfdfd4c77d..a5fad7216c 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[[bin]] +name = "storage_controller" +path = "src/main.rs" + [features] default = [] # Enables test-only APIs and behaviors diff --git a/control_plane/src/attachment_service.rs 
b/control_plane/src/attachment_service.rs index 610d7386d9..5c97561985 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -34,7 +34,7 @@ pub struct AttachmentService { client: reqwest::Client, } -const COMMAND: &str = "attachment_service"; +const COMMAND: &str = "storage_controller"; const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a5e1325cfe..03270723a6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -232,7 +232,7 @@ impl LocalEnv { // run from the same location as neon_local. This means that for compatibility // tests that run old pageserver/safekeeper, they still run latest attachment service. let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); - neon_local_bin_dir.join("attachment_service") + neon_local_bin_dir.join("storage_controller") } pub fn safekeeper_bin(&self) -> PathBuf { From 602a4da9a5cdfac7f04509950704da811f08b968 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 7 Mar 2024 16:23:42 +0200 Subject: [PATCH 39/52] bench: run branch_creation_many at 500, seeded (#6959) We have a benchmark for creating a lot of branches, but it does random things, and the branch count is not what we is the largest maximum we aim to support. If this PR would stabilize the benchmark total duration it means that there are some structures which are very much slower than others. Then we should add a seed-outputting variant to help find and reproduce such cases. Additionally, record for the benchmark: - shutdown duration - startup metrics once done (on restart) - duration of first compaction completion via debug logging --- pageserver/src/tenant/tasks.rs | 7 +- .../performance/test_branch_creation.py | 110 ++++++++++++++++-- 2 files changed, 109 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 57c3edcddd..e4f5f75132 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -101,6 +101,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) + // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) @@ -198,7 +199,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. 
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 6edcb8f1f2..9777bf6748 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -1,4 +1,5 @@ import random +import re import statistics import threading import time @@ -7,11 +8,14 @@ from contextlib import closing from typing import List import pytest -from fixtures.benchmark_fixture import MetricReport +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn +from fixtures.utils import wait_until +from prometheus_client.samples import Sample def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): @@ -89,11 +93,17 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) _record_branch_creation_durations(neon_compare, branch_creation_durations) -@pytest.mark.parametrize("n_branches", [1024]) -# Test measures the latency of branch creation when creating a lot of branches. -def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): +@pytest.mark.parametrize("n_branches", [500, 1024]) +@pytest.mark.parametrize("shape", ["one_ancestor", "random"]) +def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): + """ + Test measures the latency of branch creation when creating a lot of branches. + """ env = neon_compare.env + # seed the prng so we will measure the same structure every time + rng = random.Random("2024-02-29") + env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") @@ -102,15 +112,101 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): branch_creation_durations = [] for i in range(n_branches): - # random a source branch - p = random.randint(0, i) + if shape == "random": + parent = f"b{rng.randint(0, i)}" + elif shape == "one_ancestor": + parent = "b0" + else: + raise RuntimeError(f"unimplemented shape: {shape}") + timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) + # each of these uploads to remote storage before completion + env.neon_cli.create_branch(f"b{i + 1}", parent) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) _record_branch_creation_durations(neon_compare, branch_creation_durations) + endpoint.stop_and_destroy() + + with neon_compare.record_duration("shutdown"): + # this sleeps 100ms between polls + env.pageserver.stop() + + startup_line = "INFO version: git(-env)?:" + + # find the first line of the log file so we can find the next start later + _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + + # start without gc so we can time compaction with less noise; use shorter + # period for compaction so it starts earlier + env.pageserver.start( + overrides=( + "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", + ), + # this does print more than we want, but the number should be comparable between runs + extra_env_vars={ + "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" + }, + ) + + _, second_start = wait_until( + 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + ) + env.pageserver.quiesce_tenants() + + 
wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + + # wait for compaction to complete, which most likely has already done so multiple times + msg, _ = wait_until( + 30, + 1, + lambda: env.pageserver.assert_log_contains( + f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start + ), + ) + needle = re.search(" elapsed_ms=([0-9]+)", msg) + assert needle is not None, "failed to find the elapsed time" + duration = int(needle.group(1)) / 1000.0 + neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER) + + +def wait_and_record_startup_metrics( + pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str +): + """ + Waits until all startup metrics have non-zero values on the pageserver, then records them on the target + """ + + client = pageserver.http_client() + + expected_labels = set( + [ + "background_jobs_can_start", + "complete", + "initial", + "initial_tenant_load", + "initial_tenant_load_remote", + ] + ) + + def metrics_are_filled() -> List[Sample]: + m = client.get_metrics() + samples = m.query_all("pageserver_startup_duration_seconds") + # we should not have duplicate labels + matching = [ + x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0 + ] + assert len(matching) == len(expected_labels) + return matching + + samples = wait_until(10, 1, metrics_are_filled) + + for sample in samples: + phase = sample.labels["phase"] + name = f"{prefix}.{phase}" + target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER) + # Test measures the branch creation time when branching from a timeline with a lot of relations. # From 871977f14c2ca93f736a82c07da93a3c142d0ab0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 7 Mar 2024 16:02:20 +0000 Subject: [PATCH 40/52] pageserver: fix early bail out in vectored get (#7038) ## Problem When vectored get encountered a portion of the key range that could not be mapped to any layer in the current timeline it would incorrectly bail out of the current timeline. This is incorrect since we may have had layers queued for a visit in the fringe. ## Summary of changes * Add a repro unit test * Remove the early bail out path * Simplify range search return value --- pageserver/src/tenant.rs | 165 +++++++++++++++++++++++++++-- pageserver/src/tenant/layer_map.rs | 24 +++-- pageserver/src/tenant/timeline.rs | 9 +- 3 files changed, 176 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b24c06c4da..2f23e535fa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3679,7 +3679,10 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create(test_name: &'static str) -> anyhow::Result { + pub fn create_custom( + test_name: &'static str, + tenant_conf: TenantConf, + ) -> anyhow::Result { setup_logging(); let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -3691,14 +3694,6 @@ pub(crate) mod harness { // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // Disable automatic GC and compaction to make the unit tests more deterministic. - // The tests perform them manually if needed. 
- let tenant_conf = TenantConf { - gc_period: Duration::ZERO, - compaction_period: Duration::ZERO, - ..TenantConf::default() - }; - let tenant_id = TenantId::generate(); let tenant_shard_id = TenantShardId::unsharded(tenant_id); fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; @@ -3726,6 +3721,18 @@ pub(crate) mod harness { }) } + pub fn create(test_name: &'static str) -> anyhow::Result { + // Disable automatic GC and compaction to make the unit tests more deterministic. + // The tests perform them manually if needed. + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; + + Self::create_custom(test_name, tenant_conf) + } + pub fn span(&self) -> tracing::Span { info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) } @@ -3833,6 +3840,7 @@ mod tests { use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; @@ -4637,6 +4645,145 @@ mod tests { Ok(()) } + // Test that vectored get handles layer gaps correctly + // by advancing into the next ancestor timeline if required. + // + // The test generates timelines that look like the diagram below. + // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram). + // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram). + // + // ``` + //-------------------------------+ + // ... | + // [ L1 ] | + // [ / L1 ] | Child Timeline + // ... | + // ------------------------------+ + // [ X L1 ] | Parent Timeline + // ------------------------------+ + // ``` + #[tokio::test] + async fn test_get_vectored_key_gap() -> anyhow::Result<()> { + let tenant_conf = TenantConf { + // Make compaction deterministic + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + // Encourage creation of L1 layers + checkpoint_distance: 16 * 1024, + compaction_target_size: 8 * 1024, + ..TenantConf::default() + }; + + let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?; + let (tenant, ctx) = harness.load().await; + + let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let gap_at_key = current_key.add(100); + let mut current_lsn = Lsn(0x10); + + const KEY_COUNT: usize = 10_000; + + let timeline_id = TimelineId::generate(); + let current_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + let writer = current_timeline.writer().await; + writer + .put( + gap_at_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + let mut latest_lsns = HashMap::new(); + latest_lsns.insert(gap_at_key, current_lsn); + + current_timeline.freeze_and_flush().await?; + + let child_timeline_id = TimelineId::generate(); + + tenant + .branch_timeline_test( + ¤t_timeline, + child_timeline_id, + Some(current_lsn), + &ctx, + ) + .await?; + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + for i in 0..KEY_COUNT { + if current_key == gap_at_key { + current_key = current_key.next(); + continue; + } + + current_lsn += 0x10; + + let writer = child_timeline.writer().await; + writer + .put( + current_key, + 
current_lsn, + &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + latest_lsns.insert(current_key, current_lsn); + current_key = current_key.next(); + + // Flush every now and then to encourage layer file creation. + if i % 500 == 0 { + child_timeline.freeze_and_flush().await?; + } + } + + child_timeline.freeze_and_flush().await?; + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceRepartition); + child_timeline + .compact(&CancellationToken::new(), flags, &ctx) + .await?; + + let key_near_end = { + let mut tmp = current_key; + tmp.field6 -= 10; + tmp + }; + + let key_near_gap = { + let mut tmp = gap_at_key; + tmp.field6 -= 10; + tmp + }; + + let read = KeySpace { + ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], + }; + let results = child_timeline + .get_vectored_impl(read.clone(), current_lsn, &ctx) + .await?; + + for (key, img_res) in results { + let expected = test_img(&format!("{} at {}", key, latest_lsns[&key])); + assert_eq!(img_res?, expected); + } + + Ok(()) + } + #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { let harness = TenantHarness::create("test_random_updates")?; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 5f4814cc6b..b8ed69052f 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -460,15 +460,22 @@ impl LayerMap { } } - pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> Option { - let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + let mut result = RangeSearchResult::new(); + result.not_found.add_range(key_range); + return result; + } + }; let raw_range = key_range.start.to_i128()..key_range.end.to_i128(); let delta_changes = version.delta_coverage.range_overlaps(&raw_range); let image_changes = version.image_coverage.range_overlaps(&raw_range); let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); - Some(collector.collect()) + collector.collect() } /// Start a batch of updates, applied on drop @@ -995,8 +1002,13 @@ mod tests { let layer_map = LayerMap::default(); let range = Key::from_i128(100)..Key::from_i128(200); - let res = layer_map.range_search(range, Lsn(100)); - assert!(res.is_none()); + let res = layer_map.range_search(range.clone(), Lsn(100)); + assert_eq!( + res.not_found.to_keyspace(), + KeySpace { + ranges: vec![range] + } + ); } #[test] @@ -1033,7 +1045,7 @@ mod tests { for start in 0..60 { for end in (start + 1)..60 { let range = Key::from_i128(start)..Key::from_i128(end); - let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap(); + let result = layer_map.range_search(range.clone(), Lsn(100)); let expected = brute_force_range_search(&layer_map, range, Lsn(100)); assert_range_search_result_eq(result, expected); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7ac7c15876..71a958206c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2784,7 +2784,7 @@ impl Timeline { let guard = timeline.layers.read().await; let layers = guard.layer_map(); - 'outer: loop { + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } @@ -2810,12 
+2810,7 @@ impl Timeline { } None => { for range in unmapped_keyspace.ranges.iter() { - let results = match layers.range_search(range.clone(), cont_lsn) { - Some(res) => res, - None => { - break 'outer; - } - }; + let results = layers.range_search(range.clone(), cont_lsn); results .found From d5a6a2a16d7e63d21ef00b3d582da57485f42d06 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 7 Mar 2024 17:10:03 +0000 Subject: [PATCH 41/52] storage controller: robustness improvements (#7027) ## Problem Closes: https://github.com/neondatabase/neon/issues/6847 Closes: https://github.com/neondatabase/neon/issues/7006 ## Summary of changes - Pageserver API calls are wrapped in timeout/retry logic: this prevents a reconciler getting hung on a pageserver API hang, and prevents reconcilers having to totally retry if one API call returns a retryable error (e.g. 503). - Add a cancellation token to `Node`, so that when we mark a node offline we will cancel any API calls in progress to that node, and avoid issuing any more API calls to that offline node. - If the dirty locations of a shard are all on offline nodes, then don't spawn a reconciler - In re-attach, if we have no observed state object for a tenant then construct one with conf: None (which means "unknown"). Then in Reconciler, implement a TODO for scanning such locations before running, so that we will avoid spuriously incrementing a generation in the case of a node that was offline while we started (this is the case that tripped up #7006) - Refactoring: make Node contents private (and thereby guarantee that updates to availability mode reliably update the cancellation token.) - Refactoring: don't pass the whole map of nodes into Reconciler (and thereby remove a bunch of .expect() calls) Some of this was discovered/tested with a new failure injection test that will come in a separate PR, once it is stable enough for CI. --- control_plane/attachment_service/src/node.rs | 218 ++++++++++- .../attachment_service/src/reconciler.rs | 356 +++++++++++------- .../attachment_service/src/scheduler.rs | 30 +- .../attachment_service/src/service.rs | 348 ++++++++--------- .../attachment_service/src/tenant_state.rs | 129 +++++-- pageserver/client/src/mgmt_api.rs | 20 +- pageserver/src/http/routes.rs | 27 ++ pageserver/src/tenant/mgr.rs | 10 + 8 files changed, 749 insertions(+), 389 deletions(-) diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 1f9dcef033..27b03608fa 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -1,6 +1,16 @@ -use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy}; +use std::{str::FromStr, time::Duration}; + +use hyper::StatusCode; +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api; use serde::Serialize; -use utils::id::NodeId; +use tokio_util::sync::CancellationToken; +use utils::{backoff, id::NodeId}; use crate::persistence::NodePersistence; @@ -12,16 +22,29 @@ use crate::persistence::NodePersistence; /// implementation of serialization on this type is only for debug dumps. 
#[derive(Clone, Serialize)] pub(crate) struct Node { - pub(crate) id: NodeId, + id: NodeId, - pub(crate) availability: NodeAvailability, - pub(crate) scheduling: NodeSchedulingPolicy, + availability: NodeAvailability, + scheduling: NodeSchedulingPolicy, - pub(crate) listen_http_addr: String, - pub(crate) listen_http_port: u16, + listen_http_addr: String, + listen_http_port: u16, - pub(crate) listen_pg_addr: String, - pub(crate) listen_pg_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + + // This cancellation token means "stop any RPCs in flight to this node, and don't start + // any more". It is not related to process shutdown. + #[serde(skip)] + cancel: CancellationToken, +} + +/// When updating [`Node::availability`] we use this type to indicate to the caller +/// whether/how they changed it. +pub(crate) enum AvailabilityTransition { + ToActive, + ToOffline, + Unchanged, } impl Node { @@ -29,6 +52,71 @@ impl Node { format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) } + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + + pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) { + self.scheduling = scheduling + } + + /// Does this registration request match `self`? This is used when deciding whether a registration + /// request should be allowed to update an existing record with the same node ID. + pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { + self.id == register_req.node_id + && self.listen_http_addr == register_req.listen_http_addr + && self.listen_http_port == register_req.listen_http_port + && self.listen_pg_addr == register_req.listen_pg_addr + && self.listen_pg_port == register_req.listen_pg_port + } + + /// For a shard located on this node, populate a response object + /// with this node's address information. + pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { + TenantLocateResponseShard { + shard_id, + node_id: self.id, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } + + pub(crate) fn set_availability( + &mut self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use NodeAvailability::*; + let transition = match (self.availability, availability) { + (Offline, Active) => { + // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any + // users of previously-cloned copies of the node will still see the old cancellation + // state. For example, Reconcilers in flight will have to complete and be spawned + // again to realize that the node has become available. + self.cancel = CancellationToken::new(); + AvailabilityTransition::ToActive + } + (Active, Offline) => { + // Fire the node's cancellation token to cancel any in-flight API requests to it + self.cancel.cancel(); + AvailabilityTransition::ToOffline + } + _ => AvailabilityTransition::Unchanged, + }; + self.availability = availability; + transition + } + + /// Whether we may send API requests to this node. + pub(crate) fn is_available(&self) -> bool { + // When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds + // a reference to the original Node's cancellation status. 
Checking both of these results + // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable + // when we cloned it, or if the original Node instance's cancellation token was fired. + matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled() + } + /// Is this node elegible to have work scheduled onto it? pub(crate) fn may_schedule(&self) -> bool { match self.availability { @@ -44,6 +132,26 @@ impl Node { } } + pub(crate) fn new( + id: NodeId, + listen_http_addr: String, + listen_http_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + ) -> Self { + Self { + id, + listen_http_addr, + listen_http_port, + listen_pg_addr, + listen_pg_port, + scheduling: NodeSchedulingPolicy::Filling, + // TODO: we shouldn't really call this Active until we've heartbeated it. + availability: NodeAvailability::Active, + cancel: CancellationToken::new(), + } + } + pub(crate) fn to_persistent(&self) -> NodePersistence { NodePersistence { node_id: self.id.0 as i64, @@ -54,4 +162,96 @@ impl Node { listen_pg_port: self.listen_pg_port as i32, } } + + pub(crate) fn from_persistent(np: NodePersistence) -> Self { + Self { + id: NodeId(np.node_id as u64), + // At startup we consider a node offline until proven otherwise. + availability: NodeAvailability::Offline, + scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy) + .expect("Bad scheduling policy in DB"), + listen_http_addr: np.listen_http_addr, + listen_http_port: np.listen_http_port as u16, + listen_pg_addr: np.listen_pg_addr, + listen_pg_port: np.listen_pg_port as u16, + cancel: CancellationToken::new(), + } + } + + /// Wrapper for issuing requests to pageserver management API: takes care of generic + /// retry/backoff for retryable HTTP status codes. + /// + /// This will return None to indicate cancellation. Cancellation may happen from + /// the cancellation token passed in, or from Self's cancellation token (i.e. node + /// going offline). + pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Option> + where + O: FnMut(mgmt_api::Client) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = + mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref()); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! 
{ + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to node {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + } +} + +impl std::fmt::Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } +} + +impl std::fmt::Debug for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } } diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 0fa6e8e2f8..603da9bf02 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,6 +1,5 @@ use crate::persistence::Persistence; use crate::service; -use pageserver_api::controller_api::NodeAvailability; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -28,15 +27,16 @@ pub(super) struct Reconciler { pub(crate) shard: ShardIdentity, pub(crate) generation: Option, pub(crate) intent: TargetState, + + /// Nodes not referenced by [`Self::intent`], from which we should try + /// to detach this tenant shard. + pub(crate) detach: Vec, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, pub(crate) service_config: service::Config, - /// A snapshot of the pageservers as they were when we were asked - /// to reconcile. - pub(crate) pageservers: Arc>, - /// A hook to notify the running postgres instances when we change the location /// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag /// and guarantee eventual retries. @@ -67,29 +67,37 @@ pub(super) struct Reconciler { /// and the TargetState is just the instruction for a particular Reconciler run. 
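The `Node::with_client_retries` wrapper above and the reworked `Reconciler` here share one calling convention: cancellation is a first-class outcome (`None` from the wrapper, `ReconcileError::Cancel` further down) rather than a hang or a panic, and only transient failures (503/504/408 and dropped response bodies) are retried. A dependency-free sketch of that convention, with a made-up `ApiError` standing in for `mgmt_api::Error`; the real code delegates the looping to `utils::backoff::retry` instead of inlining it.

```rust
use std::{future::Future, time::Duration};
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
enum ApiError {
    Transient(String), // e.g. 503/504/408 or a dropped response body: worth retrying
    Fatal(String),     // any other failure: retrying will not help
}

/// `None` = cancelled, `Some(Err)` = fatal or retries exhausted, `Some(Ok)` = success.
async fn retry_with_cancel<T, O, F>(
    mut op: O,
    max_retries: u32,
    cancel: &CancellationToken,
) -> Option<Result<T, ApiError>>
where
    O: FnMut() -> F,
    F: Future<Output = Result<T, ApiError>>,
{
    for attempt in 0..=max_retries {
        // Each attempt races the request against the cancellation token.
        let result = tokio::select! {
            r = op() => r,
            _ = cancel.cancelled() => return None,
        };
        match result {
            Ok(v) => return Some(Ok(v)),
            Err(e @ ApiError::Fatal(_)) => return Some(Err(e)),
            Err(e) if attempt == max_retries => return Some(Err(e)),
            Err(_) => tokio::time::sleep(Duration::from_millis(100 * 2u64.pow(attempt))).await,
        }
    }
    unreachable!("every final attempt returns above")
}
```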
#[derive(Debug)] pub(crate) struct TargetState { - pub(crate) attached: Option, - pub(crate) secondary: Vec, + pub(crate) attached: Option, + pub(crate) secondary: Vec, } impl TargetState { - pub(crate) fn from_intent(intent: &IntentState) -> Self { + pub(crate) fn from_intent(nodes: &HashMap, intent: &IntentState) -> Self { Self { - attached: *intent.get_attached(), - secondary: intent.get_secondary().clone(), + attached: intent.get_attached().map(|n| { + nodes + .get(&n) + .expect("Intent attached referenced non-existent node") + .clone() + }), + secondary: intent + .get_secondary() + .iter() + .map(|n| { + nodes + .get(n) + .expect("Intent secondary referenced non-existent node") + .clone() + }) + .collect(), } } - - fn all_pageservers(&self) -> Vec { - let mut result = self.secondary.clone(); - if let Some(node_id) = &self.attached { - result.push(*node_id); - } - result - } } #[derive(thiserror::Error, Debug)] pub(crate) enum ReconcileError { + #[error(transparent)] + Remote(#[from] mgmt_api::Error), #[error(transparent)] Notify(#[from] NotifyError), #[error("Cancelled")] @@ -101,45 +109,83 @@ pub(crate) enum ReconcileError { impl Reconciler { async fn location_config( &mut self, - node_id: NodeId, + node: &Node, config: LocationConfig, flush_ms: Option, lazy: bool, - ) -> anyhow::Result<()> { - let node = self - .pageservers - .get(&node_id) - .expect("Pageserver may not be removed while referenced"); + ) -> Result<(), ReconcileError> { + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + + // TODO: amend locations that use long-polling: they will hit this timeout. + let timeout = Duration::from_secs(25); + + tracing::info!("location_config({node}) calling: {:?}", config); + let tenant_shard_id = self.tenant_shard_id; + let config_ref = &config; + match node + .with_client_retries( + |client| async move { + let config = config_ref.clone(); + client + .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + timeout, + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("location_config({node}) complete: {:?}", config); self.observed .locations - .insert(node.id, ObservedStateLocation { conf: None }); - - tracing::info!("location_config({}) calling: {:?}", node_id, config); - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - client - .location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy) - .await?; - tracing::info!("location_config({}) complete: {:?}", node_id, config); - - self.observed - .locations - .insert(node.id, ObservedStateLocation { conf: Some(config) }); + .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); Ok(()) } + fn get_node(&self, node_id: &NodeId) -> Option<&Node> { + if let Some(node) = self.intent.attached.as_ref() { + if node.get_id() == *node_id { + return Some(node); + } + } + + if let Some(node) = self + .intent + .secondary + .iter() + .find(|n| n.get_id() == *node_id) + { + return Some(node); + } + + if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) { + return Some(node); + } + + None + } + async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> { - let destination = if let Some(node_id) = self.intent.attached { - match self.observed.locations.get(&node_id) { + let destination = if let Some(node) = &self.intent.attached { + match 
self.observed.locations.get(&node.get_id()) { Some(conf) => { // We will do a live migration only if the intended destination is not // currently in an attached state. match &conf.conf { Some(conf) if conf.mode == LocationConfigMode::Secondary => { // Fall through to do a live migration - node_id + node } None | Some(_) => { // Attached or uncertain: don't do a live migration, proceed @@ -152,7 +198,7 @@ impl Reconciler { None => { // Our destination is not attached: maybe live migrate if some other // node is currently attached. Fall through. - node_id + node } } } else { @@ -165,15 +211,13 @@ impl Reconciler { for (node_id, state) in &self.observed.locations { if let Some(observed_conf) = &state.conf { if observed_conf.mode == LocationConfigMode::AttachedSingle { - let node = self - .pageservers - .get(node_id) - .expect("Nodes may not be removed while referenced"); // We will only attempt live migration if the origin is not offline: this // avoids trying to do it while reconciling after responding to an HA failover. - if !matches!(node.availability, NodeAvailability::Offline) { - origin = Some(*node_id); - break; + if let Some(node) = self.get_node(node_id) { + if node.is_available() { + origin = Some(node.clone()); + break; + } } } } @@ -186,7 +230,7 @@ impl Reconciler { // We have an origin and a destination: proceed to do the live migration tracing::info!("Live migrating {}->{}", origin, destination); - self.live_migrate(origin, destination).await?; + self.live_migrate(origin, destination.clone()).await?; Ok(()) } @@ -194,13 +238,8 @@ impl Reconciler { async fn get_lsns( &self, tenant_shard_id: TenantShardId, - node_id: &NodeId, + node: &Node, ) -> anyhow::Result> { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - let client = mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); @@ -211,19 +250,27 @@ impl Reconciler { .collect()) } - async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) { - let node = self - .pageservers - .get(node_id) - .expect("Pageserver may not be removed while referenced"); - - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); - - match client.tenant_secondary_download(tenant_shard_id).await { - Ok(()) => {} - Err(_) => { - tracing::info!(" (skipping, destination wasn't in secondary mode)") + async fn secondary_download( + &self, + tenant_shard_id: TenantShardId, + node: &Node, + ) -> Result<(), ReconcileError> { + match node + .with_client_retries( + |client| async move { client.tenant_secondary_download(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(60), + &self.cancel, + ) + .await + { + None => Err(ReconcileError::Cancel), + Some(Ok(_)) => Ok(()), + Some(Err(e)) => { + tracing::info!(" (skipping destination download: {})", e); + Ok(()) } } } @@ -231,17 +278,14 @@ impl Reconciler { async fn await_lsn( &self, tenant_shard_id: TenantShardId, - pageserver_id: &NodeId, + node: &Node, baseline: HashMap, ) -> anyhow::Result<()> { loop { - let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await { + let latest = match self.get_lsns(tenant_shard_id, node).await { Ok(l) => l, Err(e) => { - println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", - pageserver_id - ); + tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",); std::thread::sleep(Duration::from_millis(500)); continue; } @@ -251,7 +295,7 @@ impl Reconciler 
{ for (timeline_id, baseline_lsn) in &baseline { match latest.get(timeline_id) { Some(latest_lsn) => { - println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); + tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); if latest_lsn < baseline_lsn { any_behind = true; } @@ -266,7 +310,7 @@ impl Reconciler { } if !any_behind { - println!("✅ LSN caught up. Proceeding..."); + tracing::info!("✅ LSN caught up. Proceeding..."); break; } else { std::thread::sleep(Duration::from_millis(500)); @@ -278,11 +322,11 @@ impl Reconciler { pub async fn live_migrate( &mut self, - origin_ps_id: NodeId, - dest_ps_id: NodeId, - ) -> anyhow::Result<()> { + origin_ps: Node, + dest_ps: Node, + ) -> Result<(), ReconcileError> { // `maybe_live_migrate` is responsibble for sanity of inputs - assert!(origin_ps_id != dest_ps_id); + assert!(origin_ps.get_id() != dest_ps.get_id()); fn build_location_config( shard: &ShardIdentity, @@ -302,10 +346,7 @@ impl Reconciler { } } - tracing::info!( - "🔁 Switching origin pageserver {} to stale mode", - origin_ps_id - ); + tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",); // FIXME: it is incorrect to use self.generation here, we should use the generation // from the ObservedState of the origin pageserver (it might be older than self.generation) @@ -316,26 +357,18 @@ impl Reconciler { self.generation, None, ); - self.location_config( - origin_ps_id, - stale_conf, - Some(Duration::from_secs(10)), - false, - ) - .await?; + self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false) + .await?; - let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?); + let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?); // If we are migrating to a destination that has a secondary location, warm it up first - if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) { + if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) { if let Some(destination_conf) = &destination_conf.conf { if destination_conf.mode == LocationConfigMode::Secondary { - tracing::info!( - "🔁 Downloading latest layers to destination pageserver {}", - dest_ps_id, - ); - self.secondary_download(self.tenant_shard_id, &dest_ps_id) - .await; + tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",); + self.secondary_download(self.tenant_shard_id, &dest_ps) + .await?; } } } @@ -343,7 +376,7 @@ impl Reconciler { // Increment generation before attaching to new pageserver self.generation = Some( self.persistence - .increment_generation(self.tenant_shard_id, dest_ps_id) + .increment_generation(self.tenant_shard_id, dest_ps.get_id()) .await?, ); @@ -355,23 +388,23 @@ impl Reconciler { None, ); - tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id); - self.location_config(dest_ps_id, dest_conf, None, false) + tracing::info!("🔁 Attaching to pageserver {dest_ps}"); + self.location_config(&dest_ps, dest_conf, None, false) .await?; if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); - self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline) + self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) .await?; } - tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id); + tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}"); // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the 
tenant unavailable. while let Err(e) = self.compute_notify().await { match e { - NotifyError::Fatal(_) => return Err(anyhow::anyhow!(e)), + NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), _ => { tracing::warn!( "Live migration blocked by compute notification error, retrying: {e}" @@ -389,22 +422,19 @@ impl Reconciler { None, Some(LocationConfigSecondary { warm: true }), ); - self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false) + self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false) .await?; // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail // partway through. In fact, all location conf API calls should be in a wrapper that sets // the observed state to None, then runs, then sets it to what we wrote. self.observed.locations.insert( - origin_ps_id, + origin_ps.get_id(), ObservedStateLocation { conf: Some(origin_secondary_conf), }, ); - println!( - "🔁 Switching to AttachedSingle mode on pageserver {}", - dest_ps_id - ); + tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",); let dest_final_conf = build_location_config( &self.shard, &self.config, @@ -412,16 +442,61 @@ impl Reconciler { self.generation, None, ); - self.location_config(dest_ps_id, dest_final_conf.clone(), None, false) + self.location_config(&dest_ps, dest_final_conf.clone(), None, false) .await?; self.observed.locations.insert( - dest_ps_id, + dest_ps.get_id(), ObservedStateLocation { conf: Some(dest_final_conf), }, ); - println!("✅ Migration complete"); + tracing::info!("✅ Migration complete"); + + Ok(()) + } + + async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> { + // If the attached node has uncertain state, read it from the pageserver before proceeding: this + // is important to avoid spurious generation increments. + // + // We don't need to do this for secondary/detach locations because it's harmless to just PUT their + // location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate + // the `Timeline` object in the pageserver. + + let Some(attached_node) = self.intent.attached.as_ref() else { + // Nothing to do + return Ok(()); + }; + + if matches!( + self.observed.locations.get(&attached_node.get_id()), + Some(ObservedStateLocation { conf: None }) + ) { + let tenant_shard_id = self.tenant_shard_id; + let observed_conf = match attached_node + .with_client_retries( + |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(observed)) => observed, + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); + self.observed.locations.insert( + attached_node.get_id(), + ObservedStateLocation { + conf: observed_conf, + }, + ); + } Ok(()) } @@ -433,14 +508,14 @@ impl Reconciler { /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { - // TODO: if any of self.observed is None, call to remote pageservers - // to learn correct state. 
+ // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it + self.maybe_refresh_observed().await?; // Special case: live migration self.maybe_live_migrate().await?; // If the attached pageserver is not attached, do so now. - if let Some(node_id) = self.intent.attached { + if let Some(node) = self.intent.attached.as_ref() { // If we are in an attached policy, then generation must have been set (null generations // are only present when a tenant is initially loaded with a secondary policy) debug_assert!(self.generation.is_some()); @@ -451,10 +526,10 @@ impl Reconciler { }; let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); - match self.observed.locations.get(&node_id) { + match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(%node_id, "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } observed => { // In all cases other than a matching observed configuration, we will @@ -492,16 +567,21 @@ impl Reconciler { if increment_generation { let generation = self .persistence - .increment_generation(self.tenant_shard_id, node_id) + .increment_generation(self.tenant_shard_id, node.get_id()) .await?; self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(%node_id, "Observed configuration requires update."); + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + // Because `node` comes from a ref to &self, clone it before calling into a &mut self + // function: this could be avoided by refactoring the state mutated by location_config into + // a separate type to Self. + let node = node.clone(); + // Use lazy=true, because we may run many of Self concurrently, and do not want to // overload the pageserver with logical size calculations. - self.location_config(node_id, wanted_conf, None, true) - .await?; + self.location_config(&node, wanted_conf, None, true).await?; self.compute_notify().await?; } } @@ -510,33 +590,27 @@ impl Reconciler { // Configure secondary locations: if these were previously attached this // implicitly downgrades them from attached to secondary. let mut changes = Vec::new(); - for node_id in &self.intent.secondary { + for node in &self.intent.secondary { let wanted_conf = secondary_location_conf(&self.shard, &self.config); - match self.observed.locations.get(node_id) { + match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do - tracing::info!(%node_id, "Observed configuration already correct.") + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { // In all cases other than a matching observed configuration, we will // reconcile this location. - tracing::info!(%node_id, "Observed configuration requires update."); - changes.push((*node_id, wanted_conf)) + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) } } } // Detach any extraneous pageservers that are no longer referenced // by our intent. - let all_pageservers = self.intent.all_pageservers(); - for node_id in self.observed.locations.keys() { - if all_pageservers.contains(node_id) { - // We are only detaching pageservers that aren't used at all. 
- continue; - } - + for node in &self.detach { changes.push(( - *node_id, + node.clone(), LocationConfig { mode: LocationConfigMode::Detached, generation: None, @@ -549,11 +623,11 @@ impl Reconciler { )); } - for (node_id, conf) in changes { + for (node, conf) in changes { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(node_id, conf, None, false).await?; + self.location_config(&node, conf, None, false).await?; } Ok(()) @@ -562,12 +636,12 @@ impl Reconciler { pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> { // Whenever a particular Reconciler emits a notification, it is always notifying for the intended // destination. - if let Some(node_id) = self.intent.attached { + if let Some(node) = &self.intent.attached { let result = self .compute_hook .notify( self.tenant_shard_id, - node_id, + node.get_id(), self.shard.stripe_size, &self.cancel, ) @@ -576,7 +650,7 @@ impl Reconciler { // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - tracing::warn!("Failed to notify compute of attached pageserver {node_id}: {e}"); + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. self.compute_notify_failure = true; diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 87fce3df25..26a2707e8d 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -43,7 +43,7 @@ impl Scheduler { let mut scheduler_nodes = HashMap::new(); for node in nodes { scheduler_nodes.insert( - node.id, + node.get_id(), SchedulerNode { shard_count: 0, may_schedule: node.may_schedule(), @@ -68,7 +68,7 @@ impl Scheduler { let mut expect_nodes: HashMap = HashMap::new(); for node in nodes { expect_nodes.insert( - node.id, + node.get_id(), SchedulerNode { shard_count: 0, may_schedule: node.may_schedule(), @@ -156,7 +156,7 @@ impl Scheduler { pub(crate) fn node_upsert(&mut self, node: &Node) { use std::collections::hash_map::Entry::*; - match self.nodes.entry(node.id) { + match self.nodes.entry(node.get_id()) { Occupied(mut entry) => { entry.get_mut().may_schedule = node.may_schedule(); } @@ -255,7 +255,6 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; - use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. 
@@ -264,18 +263,17 @@ pub(crate) mod test_utils { pub(crate) fn make_test_nodes(n: u64) -> HashMap { (1..n + 1) .map(|i| { - ( - NodeId(i), - Node { - id: NodeId(i), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - listen_http_addr: format!("httphost-{i}"), - listen_http_port: 80 + i as u16, - listen_pg_addr: format!("pghost-{i}"), - listen_pg_port: 5432 + i as u16, - }, - ) + (NodeId(i), { + let node = Node::new( + NodeId(i), + format!("httphost-{i}"), + 80 + i as u16, + format!("pghost-{i}"), + 5432 + i as u16, + ); + assert!(node.is_available()); + node + }) }) .collect() } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index d162ab5c65..f41c4f89b9 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -16,9 +16,9 @@ use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::TenantConfigRequest, }; @@ -39,7 +39,6 @@ use pageserver_client::mgmt_api; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{ - backoff, completion::Barrier, generation::Generation, http::error::ApiError, @@ -50,7 +49,7 @@ use utils::{ use crate::{ compute_hook::{self, ComputeHook}, - node::Node, + node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, scheduler::Scheduler, @@ -201,7 +200,8 @@ impl Service { async fn startup_reconcile(self: &Arc) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) - let mut observed = HashMap::new(); + let mut observed: HashMap)>> = + HashMap::new(); let mut nodes_online = HashSet::new(); @@ -236,7 +236,8 @@ impl Service { nodes_online.insert(node_id); for (tenant_shard_id, conf_opt) in tenant_shards { - observed.insert(tenant_shard_id, (node_id, conf_opt)); + let shard_observations = observed.entry(tenant_shard_id).or_default(); + shard_observations.push((node_id, conf_opt)); } } @@ -252,27 +253,28 @@ impl Service { let mut new_nodes = (**nodes).clone(); for (node_id, node) in new_nodes.iter_mut() { if nodes_online.contains(node_id) { - node.availability = NodeAvailability::Active; + node.set_availability(NodeAvailability::Active); scheduler.node_upsert(node); } } *nodes = Arc::new(new_nodes); - for (tenant_shard_id, (node_id, observed_loc)) in observed { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; - - tenant_state - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); + for (tenant_shard_id, shard_observations) in observed { + for (node_id, observed_loc) in shard_observations { + let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { + cleanup.push((tenant_shard_id, node_id)); + continue; + }; + tenant_state + .observed + .locations + .insert(node_id, ObservedStateLocation { 
conf: observed_loc }); + } } // Populate each tenant's intent state for (tenant_shard_id, tenant_state) in tenants.iter_mut() { - tenant_state.intent_from_observed(); + tenant_state.intent_from_observed(scheduler); if let Err(e) = tenant_state.schedule(scheduler) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available @@ -359,40 +361,19 @@ impl Service { for node in nodes.values() { node_list_futs.push({ async move { - let http_client = reqwest::ClientBuilder::new() - .timeout(Duration::from_secs(5)) - .build() - .expect("Failed to construct HTTP client"); - let client = mgmt_api::Client::from_client( - http_client, - node.base_url(), - self.config.jwt_token.as_deref(), - ); - - fn is_fatal(e: &mgmt_api::Error) -> bool { - use mgmt_api::Error::*; - match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, - ApiError(StatusCode::SERVICE_UNAVAILABLE, _) - | ApiError(StatusCode::GATEWAY_TIMEOUT, _) - | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, - ApiError(_, _) => true, - } - } - - tracing::info!("Scanning shards on node {}...", node.id); - let description = format!("List locations on {}", node.id); - let response = backoff::retry( - || client.list_location_config(), - is_fatal, - 1, - 5, - &description, - &self.cancel, - ) - .await; - - (node.id, response) + tracing::info!("Scanning shards on node {node}..."); + let timeout = Duration::from_secs(5); + let response = node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + timeout, + &self.cancel, + ) + .await; + (node.get_id(), response) } }); } @@ -662,19 +643,9 @@ impl Service { .list_nodes() .await? .into_iter() - .map(|n| Node { - id: NodeId(n.node_id as u64), - // At startup we consider a node offline until proven otherwise. 
- availability: NodeAvailability::Offline, - scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy) - .expect("Bad scheduling policy in DB"), - listen_http_addr: n.listen_http_addr, - listen_http_port: n.listen_http_port as u16, - listen_pg_addr: n.listen_pg_addr, - listen_pg_port: n.listen_pg_port as u16, - }) + .map(Node::from_persistent) .collect::>(); - let nodes: HashMap = nodes.into_iter().map(|n| (n.id, n)).collect(); + let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); tracing::info!("Loading shards from database..."); @@ -701,15 +672,13 @@ impl Service { } for node_id in node_ids { tracing::info!("Creating node {} in scheduler for tests", node_id); - let node = Node { - id: NodeId(node_id as u64), - availability: NodeAvailability::Active, - scheduling: NodeSchedulingPolicy::Active, - listen_http_addr: "".to_string(), - listen_http_port: 123, - listen_pg_addr: "".to_string(), - listen_pg_port: 123, - }; + let node = Node::new( + NodeId(node_id as u64), + "".to_string(), + 123, + "".to_string(), + 123, + ); scheduler.node_upsert(&node); } @@ -975,6 +944,12 @@ impl Service { // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; + tracing::info!( + node_id=%reattach_req.node_id, + "Incremented {} tenant shards' generations", + incremented_generations.len() + ); + // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); @@ -987,7 +962,6 @@ impl Service { id: tenant_shard_id, gen: new_gen.into().unwrap(), }); - // Apply the new generation number to our in-memory state let shard_state = locked.tenants.get_mut(&tenant_shard_id); let Some(shard_state) = shard_state else { @@ -1023,6 +997,14 @@ impl Service { if let Some(conf) = observed.conf.as_mut() { conf.generation = new_gen.into(); } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard_state + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); } // TODO: cancel/restart any running reconciliation for this tenant, it might be trying @@ -1685,7 +1667,7 @@ impl Service { .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", - node.id + node )) })?; } @@ -1739,10 +1721,7 @@ impl Service { // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache // than they had hoped for. - tracing::warn!( - "Ignoring tenant secondary download error from pageserver {}: {e}", - node.id, - ); + tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",); } Ok(()) @@ -1780,13 +1759,11 @@ impl Service { // surface immediately as an error to our caller. 
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting shard {tenant_shard_id} on node {}: {e}", - node.id + "Error deleting shard {tenant_shard_id} on node {node}: {e}", )) })?; tracing::info!( - "Shard {tenant_shard_id} on node {}, delete returned {}", - node.id, + "Shard {tenant_shard_id} on node {node}, delete returned {}", status ); if status == StatusCode::ACCEPTED { @@ -1885,10 +1862,9 @@ impl Service { create_req: TimelineCreateRequest, ) -> Result { tracing::info!( - "Creating timeline on shard {}/{}, attached to node {}", + "Creating timeline on shard {}/{}, attached to node {node}", tenant_shard_id, create_req.new_timeline_id, - node.id ); let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); @@ -2012,10 +1988,7 @@ impl Service { jwt: Option, ) -> Result { tracing::info!( - "Deleting timeline on shard {}/{}, attached to node {}", - tenant_shard_id, - timeline_id, - node.id + "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); @@ -2024,8 +1997,7 @@ impl Service { .await .map_err(|e| { ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}", - node.id + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", )) }) } @@ -2126,14 +2098,7 @@ impl Service { .get(&node_id) .expect("Pageservers may not be deleted while referenced"); - result.push(TenantLocateResponseShard { - shard_id: *tenant_shard_id, - node_id, - listen_http_addr: node.listen_http_addr.clone(), - listen_http_port: node.listen_http_port, - listen_pg_addr: node.listen_pg_addr.clone(), - listen_pg_port: node.listen_pg_port, - }); + result.push(node.shard_location(*tenant_shard_id)); match &shard_params { None => { @@ -2324,7 +2289,7 @@ impl Service { // populate the correct generation as part of its transaction, to protect us // against racing with changes in the state of the parent. generation: None, - generation_pageserver: Some(target.node.id.0 as i64), + generation_pageserver: Some(target.node.get_id().0 as i64), placement_policy: serde_json::to_string(&policy).unwrap(), // TODO: get the config out of the map config: serde_json::to_string(&TenantConfig::default()).unwrap(), @@ -2526,10 +2491,10 @@ impl Service { ))); }; - if node.availability != NodeAvailability::Active { + if !node.is_available() { // Warn but proceed: the caller may intend to manually adjust the placement of // a shard even if the node is down, e.g. if intervening during an incident. - tracing::warn!("Migrating to an unavailable node ({})", node.id); + tracing::warn!("Migrating to unavailable node {node}"); } let Some(shard) = tenants.get_mut(&tenant_shard_id) else { @@ -2784,11 +2749,7 @@ impl Service { if let Some(node) = locked.nodes.get(®ister_req.node_id) { // Note that we do not do a total equality of the struct, because we don't require // the availability/scheduling states to agree for a POST to be idempotent. 
- if node.listen_http_addr == register_req.listen_http_addr - && node.listen_http_port == register_req.listen_http_port - && node.listen_pg_addr == register_req.listen_pg_addr - && node.listen_pg_port == register_req.listen_pg_port - { + if node.registration_match(®ister_req) { tracing::info!( "Node {} re-registered with matching address", register_req.node_id @@ -2812,16 +2773,14 @@ impl Service { // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. - let new_node = Node { - id: register_req.node_id, - listen_http_addr: register_req.listen_http_addr, - listen_http_port: register_req.listen_http_port, - listen_pg_addr: register_req.listen_pg_addr, - listen_pg_port: register_req.listen_pg_port, - scheduling: NodeSchedulingPolicy::Filling, - // TODO: we shouldn't really call this Active until we've heartbeated it. - availability: NodeAvailability::Active, - }; + let new_node = Node::new( + register_req.node_id, + register_req.listen_http_addr, + register_req.listen_http_port, + register_req.listen_pg_addr, + register_req.listen_pg_port, + ); + // TODO: idempotency if the node already exists in the database self.persistence.insert_node(&new_node).await?; @@ -2866,29 +2825,14 @@ impl Service { )); }; - let mut offline_transition = false; - let mut active_transition = false; - - if let Some(availability) = &config_req.availability { - match (availability, &node.availability) { - (NodeAvailability::Offline, NodeAvailability::Active) => { - tracing::info!("Node {} transition to offline", config_req.node_id); - offline_transition = true; - } - (NodeAvailability::Active, NodeAvailability::Offline) => { - tracing::info!("Node {} transition to active", config_req.node_id); - active_transition = true; - } - _ => { - tracing::info!("Node {} no change during config", config_req.node_id); - // No change - } - }; - node.availability = *availability; - } + let availability_transition = if let Some(availability) = &config_req.availability { + node.set_availability(*availability) + } else { + AvailabilityTransition::Unchanged + }; if let Some(scheduling) = config_req.scheduling { - node.scheduling = scheduling; + node.set_scheduling(scheduling); // TODO: once we have a background scheduling ticker for fill/drain, kick it // to wake up and start working. 
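Having `set_availability` hand back an `AvailabilityTransition` (defined in `node.rs` earlier in this patch) is what lets the next hunk collapse the old `offline_transition`/`active_transition` booleans into a single `match`. A simplified, standalone rendering of the pattern; the real variants additionally fire or reset the node's cancellation token.

```rust
#[derive(Clone, Copy, PartialEq)]
enum NodeAvailability { Active, Offline }

enum AvailabilityTransition { ToActive, ToOffline, Unchanged }

struct Node { availability: NodeAvailability }

impl Node {
    // The setter owns the old-vs-new comparison, so every caller is told what
    // actually changed and cannot forget to react to it.
    fn set_availability(&mut self, new: NodeAvailability) -> AvailabilityTransition {
        use NodeAvailability::*;
        let transition = match (self.availability, new) {
            (Offline, Active) => AvailabilityTransition::ToActive,
            (Active, Offline) => AvailabilityTransition::ToOffline,
            _ => AvailabilityTransition::Unchanged,
        };
        self.availability = new;
        transition
    }
}

fn main() {
    let mut node = Node { availability: NodeAvailability::Active };
    match node.set_availability(NodeAvailability::Offline) {
        AvailabilityTransition::ToOffline => println!("cancel in-flight RPCs, reschedule shards"),
        AvailabilityTransition::ToActive => println!("reconcile shards with unknown observed state"),
        AvailabilityTransition::Unchanged => println!("nothing to do"),
    }
}
```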
@@ -2899,74 +2843,80 @@ impl Service { let new_nodes = Arc::new(new_nodes); - if offline_transition { - let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - // When a node goes offline, we set its observed configuration to None, indicating unknown: we will - // not assume our knowledge of the node's configuration is accurate until it comes back online - observed_loc.conf = None; - } + match availability_transition { + AvailabilityTransition::ToOffline => { + tracing::info!("Node {} transition to offline", config_req.node_id); + let mut tenants_affected: usize = 0; + for (tenant_shard_id, tenant_state) in tenants { + if let Some(observed_loc) = + tenant_state.observed.locations.get_mut(&config_req.node_id) + { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } - if tenant_state.intent.demote_attached(config_req.node_id) { - tenant_state.sequence = tenant_state.sequence.next(); - match tenant_state.schedule(scheduler) { - Err(e) => { - // It is possible that some tenants will become unschedulable when too many pageservers - // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); - } - Ok(()) => { - if tenant_state - .maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ) - .is_some() - { - tenants_affected += 1; - }; + if tenant_state.intent.demote_attached(config_req.node_id) { + tenant_state.sequence = tenant_state.sequence.next(); + match tenant_state.schedule(scheduler) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantState a scheduling error attribute to be queried later. + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); + } + Ok(()) => { + if tenant_state + .maybe_reconcile( + result_tx.clone(), + &new_nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ) + .is_some() + { + tenants_affected += 1; + }; + } } } } + tracing::info!( + "Launched {} reconciler tasks for tenants affected by node {} going offline", + tenants_affected, + config_req.node_id + ) } - tracing::info!( - "Launched {} reconciler tasks for tenants affected by node {} going offline", - tenants_affected, - config_req.node_id - ) - } - - if active_transition { - // When a node comes back online, we must reconcile any tenant that has a None observed - // location on the node. 
- for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { - if observed_loc.conf.is_none() { - tenant_state.maybe_reconcile( - result_tx.clone(), - &new_nodes, - &compute_hook, - &self.config, - &self.persistence, - &self.gate, - &self.cancel, - ); + AvailabilityTransition::ToActive => { + tracing::info!("Node {} transition to active", config_req.node_id); + // When a node comes back online, we must reconcile any tenant that has a None observed + // location on the node. + for tenant_state in locked.tenants.values_mut() { + if let Some(observed_loc) = + tenant_state.observed.locations.get_mut(&config_req.node_id) + { + if observed_loc.conf.is_none() { + tenant_state.maybe_reconcile( + result_tx.clone(), + &new_nodes, + &compute_hook, + &self.config, + &self.persistence, + &self.gate, + &self.cancel, + ); + } } } - } - // TODO: in the background, we should balance work back onto this pageserver + // TODO: in the background, we should balance work back onto this pageserver + } + AvailabilityTransition::Unchanged => { + tracing::info!("Node {} no change during config", config_req.node_id); + } } locked.nodes = new_nodes; diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 33b7d578c7..ddb9866527 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -1,7 +1,10 @@ -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, + time::Duration, +}; use crate::{metrics, persistence::TenantShardPersistence}; -use pageserver_api::controller_api::NodeAvailability; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -370,7 +373,7 @@ impl TenantState { /// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next, /// to get an intent state that complies with placement policy. The overall goal is to do scheduling /// in a way that makes use of any configured locations that already exist in the outside world. - pub(crate) fn intent_from_observed(&mut self) { + pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) { // Choose an attached location by filtering observed locations, and then sorting to get the highest // generation let mut attached_locs = self @@ -395,7 +398,7 @@ impl TenantState { attached_locs.sort_by_key(|i| i.1); if let Some((node_id, _gen)) = attached_locs.into_iter().last() { - self.intent.attached = Some(*node_id); + self.intent.set_attached(scheduler, Some(*node_id)); } // All remaining observed locations generate secondary intents. This includes None @@ -406,7 +409,7 @@ impl TenantState { // will take care of promoting one of these secondaries to be attached. self.observed.locations.keys().for_each(|node_id| { if Some(*node_id) != self.intent.attached { - self.intent.secondary.push(*node_id); + self.intent.push_secondary(scheduler, *node_id); } }); } @@ -564,7 +567,9 @@ impl TenantState { } } - fn dirty(&self) -> bool { + fn dirty(&self, nodes: &Arc>) -> bool { + let mut dirty_nodes = HashSet::new(); + if let Some(node_id) = self.intent.attached { // Maybe panic: it is a severe bug if we try to attach while generation is null. 
let generation = self @@ -575,7 +580,7 @@ impl TenantState { match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { - return true; + dirty_nodes.insert(node_id); } } } @@ -585,7 +590,7 @@ impl TenantState { match self.observed.locations.get(node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { - return true; + dirty_nodes.insert(*node_id); } } } @@ -593,17 +598,18 @@ impl TenantState { for node_id in self.observed.locations.keys() { if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) { // We have observed state that isn't part of our intent: need to clean it up. - return true; + dirty_nodes.insert(*node_id); } } - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. - if self.pending_compute_notification { - return true; - } + dirty_nodes.retain(|node_id| { + nodes + .get(node_id) + .map(|n| n.is_available()) + .unwrap_or(false) + }); - false + !dirty_nodes.is_empty() } #[allow(clippy::too_many_arguments)] @@ -625,15 +631,20 @@ impl TenantState { let node = pageservers .get(node_id) .expect("Nodes may not be removed while referenced"); - if observed_loc.conf.is_none() - && !matches!(node.availability, NodeAvailability::Offline) - { + if observed_loc.conf.is_none() && node.is_available() { dirty_observed = true; break; } } - if !self.dirty() && !dirty_observed { + let active_nodes_dirty = self.dirty(pageservers); + + // Even if there is no pageserver work to be done, if we have a pending notification to computes, + // wake up a reconciler to send it. + let do_reconcile = + active_nodes_dirty || dirty_observed || self.pending_compute_notification; + + if !do_reconcile { tracing::info!("Not dirty, no reconciliation needed."); return None; } @@ -663,6 +674,21 @@ impl TenantState { } } + // Build list of nodes from which the reconciler should detach + let mut detach = Vec::new(); + for node_id in self.observed.locations.keys() { + if self.intent.get_attached() != &Some(*node_id) + && !self.intent.secondary.contains(node_id) + { + detach.push( + pageservers + .get(node_id) + .expect("Intent references non-existent pageserver") + .clone(), + ) + } + } + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before // doing our sequence's work. 
let old_handle = self.reconciler.take(); @@ -677,14 +703,15 @@ impl TenantState { self.sequence = self.sequence.next(); let reconciler_cancel = cancel.child_token(); + let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, generation: self.generation, - intent: TargetState::from_intent(&self.intent), + intent: reconciler_intent, + detach, config: self.config.clone(), observed: self.observed.clone(), - pageservers: pageservers.clone(), compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, @@ -819,7 +846,10 @@ impl TenantState { #[cfg(test)] pub(crate) mod tests { - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::{ + controller_api::NodeAvailability, + shard::{ShardCount, ShardNumber}, + }; use utils::id::TenantId; use crate::scheduler::test_utils::make_test_nodes; @@ -878,7 +908,10 @@ pub(crate) mod tests { assert_eq!(tenant_state.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline - nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline; + nodes + .get_mut(&attached_node_id) + .unwrap() + .set_availability(NodeAvailability::Offline); scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); // Scheduling the node should promote the still-available secondary node to attached @@ -897,4 +930,54 @@ pub(crate) mod tests { Ok(()) } + + #[test] + fn intent_from_observed() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + + tenant_state.observed.locations.insert( + NodeId(3), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedMulti, + generation: Some(2), + secondary_conf: None, + shard_number: tenant_state.shard.number.0, + shard_count: tenant_state.shard.count.literal(), + shard_stripe_size: tenant_state.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_state.observed.locations.insert( + NodeId(2), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedStale, + generation: Some(1), + secondary_conf: None, + shard_number: tenant_state.shard.number.0, + shard_count: tenant_state.shard.count.literal(), + shard_stripe_size: tenant_state.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_state.intent_from_observed(&mut scheduler); + + // The highest generationed attached location gets used as attached + assert_eq!(tenant_state.intent.attached, Some(NodeId(3))); + // Other locations get used as secondary + assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]); + + scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?; + + tenant_state.intent.clear(&mut scheduler); + Ok(()) + } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4dde7bdf0b..732eb951c9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -7,7 +7,7 @@ use utils::{ pub mod util; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, authorization_header: Option, @@ -24,6 +24,9 @@ pub enum Error { #[error("pageserver API: {1}")] ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; @@ -287,6 +290,21 @@ impl 
Client { .map_err(Error::ReceiveBody) } + pub async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result> { + let path = format!( + "{}/v1/location_config/{tenant_shard_id}", + self.mgmt_api_endpoint + ); + self.request(Method::GET, &path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_create( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6aaf1ab27e..eafad9ab73 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; @@ -1519,6 +1520,29 @@ async fn list_location_config_handler( json_response(StatusCode::OK, result) } +async fn get_location_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let slot = state.tenant_manager.get(tenant_shard_id); + + let Some(slot) = slot else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + let result: Option = match slot { + TenantSlot::Attached(t) => Some(t.get_location_conf()), + TenantSlot::Secondary(s) => Some(s.get_location_conf()), + TenantSlot::InProgress(_) => None, + }; + + json_response(StatusCode::OK, result) +} + // Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached // (from all pageservers) as it invalidates consistency assumptions. async fn tenant_time_travel_remote_storage_handler( @@ -2223,6 +2247,9 @@ pub fn make_router( .get("/v1/location_config", |r| { api_handler(r, list_location_config_handler) }) + .get("/v1/location_config/:tenant_id", |r| { + api_handler(r, get_location_config_handler) + }) .put( "/v1/tenant/:tenant_shard_id/time_travel_remote_storage", |r| api_handler(r, tenant_time_travel_remote_storage_handler), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 06b61d4631..fc08b3c82e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1358,6 +1358,16 @@ impl TenantManager { } } + pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => None, + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => { + map.get(&tenant_shard_id).cloned() + } + } + } + pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, From ce7a82db058cecdba996a210b5afea8451bfbc4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 7 Mar 2024 18:32:09 +0100 Subject: [PATCH 42/52] Update svg_fmt (#7049) Gets upstream PR https://github.com/nical/rust_debug/pull/3 , removes trailing "s from output. 
--- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 167a2b2179..5c48942d41 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5525,9 +5525,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" +checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" [[package]] name = "syn" From 2fc89428c33508bee9fa5772c0c5c35ba3e38548 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 7 Mar 2024 09:12:06 -0900 Subject: [PATCH 43/52] Hopefully stabilize test_bad_connection.py (#6976) ## Problem It seems that even though we have a retry on basebackup, it still sometimes fails to fetch it with the failpoint enabled, resulting in a test error. ## Summary of changes If we fail to get the basebackup, disable the failpoint and try again. --- compute_tools/src/compute.rs | 8 ++++---- control_plane/src/endpoint.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5613e6c868..96ab4a06a5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -396,9 +396,9 @@ impl ComputeNode { // Gets the basebackup in a retry loop #[instrument(skip_all, fields(%lsn))] pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { - let mut retry_period_ms = 500; + let mut retry_period_ms = 500.0; let mut attempts = 0; - let max_attempts = 5; + let max_attempts = 10; loop { let result = self.try_get_basebackup(compute_state, lsn); match result { @@ -410,8 +410,8 @@ impl ComputeNode { "Failed to get basebackup: {} (attempt {}/{})", e, attempts, max_attempts ); - std::thread::sleep(std::time::Duration::from_millis(retry_period_ms)); - retry_period_ms *= 2; + std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64)); + retry_period_ms *= 1.5; } Err(_) => { return result; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 10e4c5d69f..ac0a8417ae 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -656,7 +656,7 @@ impl Endpoint { // Wait for it to start let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s + const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min loop { attempt += 1; match self.get_status().await { From 02358b21a41311be2ee610bd461093a68b14222e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 7 Mar 2024 18:23:19 +0000 Subject: [PATCH 44/52] update rustls (#7048) ## Summary of changes Update rustls from 0.21 to 0.22. reqwest/tonic/aws-smithy still use rustls 0.21. no upgrade route available yet. 
--- Cargo.lock | 293 +++++++++++++------ Cargo.toml | 10 +- libs/postgres_backend/tests/simple_select.rs | 19 +- proxy/src/bin/pg_sni_router.rs | 38 +-- proxy/src/config.rs | 54 ++-- proxy/src/proxy/tests.rs | 19 +- workspace_hack/Cargo.toml | 2 +- 7 files changed, 281 insertions(+), 154 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c48942d41..7fd9053f62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -241,7 +241,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -252,7 +252,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -626,7 +626,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls", + "rustls 0.21.9", "tokio", "tracing", ] @@ -907,6 +907,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bcder" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c627747a6774aab38beb35990d88309481378558875a41da1a4b2e373c906ef0" +dependencies = [ + "bytes", + "smallvec", +] + [[package]] name = "bincode" version = "1.3.3" @@ -935,7 +945,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.32", + "syn 2.0.52", "which", ] @@ -986,9 +996,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" dependencies = [ "serde", ] @@ -1149,7 +1159,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1574,7 +1584,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1585,7 +1595,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1627,6 +1637,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "der-parser" version = "8.2.0" @@ -1681,7 +1701,7 @@ dependencies = [ "diesel_table_macro_syntax", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1701,7 +1721,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" dependencies = [ - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1723,7 +1743,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1747,10 +1767,10 @@ version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ - "der", + "der 0.6.1", "elliptic-curve", "rfc6979", - "signature", + "signature 1.6.4", ] 
[[package]] @@ -1767,7 +1787,7 @@ checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ "base16ct", "crypto-bigint 0.4.9", - "der", + "der 0.6.1", "digest", "ff", "generic-array", @@ -1827,7 +1847,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -2087,7 +2107,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -2470,10 +2490,10 @@ dependencies = [ "http 0.2.9", "hyper", "log", - "rustls", + "rustls 0.21.9", "rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", ] [[package]] @@ -2711,7 +2731,7 @@ checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", "js-sys", - "pem 3.0.3", + "pem", "ring 0.17.6", "serde", "serde_json", @@ -3234,7 +3254,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3716,7 +3736,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3754,16 +3774,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pem" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" -dependencies = [ - "base64 0.21.1", - "serde", -] - [[package]] name = "pem" version = "3.0.3" @@ -3825,7 +3835,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3846,8 +3856,8 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" dependencies = [ - "der", - "spki", + "der 0.6.1", + "spki 0.6.0", ] [[package]] @@ -3946,14 +3956,14 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls", - "rustls-pemfile", + "rustls 0.22.2", + "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tracing", "workspace_hack", ] @@ -4042,7 +4052,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -4053,9 +4063,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -4202,8 +4212,8 @@ dependencies = [ "routerify", "rstest", "rustc-hash", - "rustls", - "rustls-pemfile", + "rustls 0.22.2", + "rustls-pemfile 2.1.1", "scopeguard", "serde", "serde_json", @@ -4219,7 +4229,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-util", "tracing", "tracing-opentelemetry", @@ -4247,9 +4257,9 @@ dependencies = [ 
[[package]] name = "quote" -version = "1.0.32" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -4370,12 +4380,12 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.11.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" +checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" dependencies = [ - "pem 2.0.1", - "ring 0.16.20", + "pem", + "ring 0.17.6", "time", "yasna", ] @@ -4393,15 +4403,15 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls", + "rustls 0.21.9", "rustls-native-certs", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "rustls-webpki 0.101.7", "ryu", "sha1_smol", "socket2 0.4.9", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "url", ] @@ -4547,14 +4557,14 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls", - "rustls-pemfile", + "rustls 0.21.9", + "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", @@ -4720,7 +4730,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.32", + "syn 2.0.52", "unicode-ident", ] @@ -4804,6 +4814,20 @@ dependencies = [ "sct", ] +[[package]] +name = "rustls" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +dependencies = [ + "log", + "ring 0.17.6", + "rustls-pki-types", + "rustls-webpki 0.102.2", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -4811,7 +4835,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50" dependencies = [ "openssl-probe", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "schannel", "security-framework", ] @@ -4825,6 +4849,22 @@ dependencies = [ "base64 0.21.1", ] +[[package]] +name = "rustls-pemfile" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" +dependencies = [ + "base64 0.21.1", + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" + [[package]] name = "rustls-webpki" version = "0.100.2" @@ -4845,6 +4885,17 @@ dependencies = [ "untrusted 0.9.0", ] +[[package]] +name = "rustls-webpki" +version = "0.102.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +dependencies = [ + "ring 0.17.6", + "rustls-pki-types", + "untrusted 0.9.0", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -4887,7 +4938,7 @@ dependencies = [ "serde_with", "thiserror", "tokio", - "tokio-rustls", + "tokio-rustls 0.25.0", "tokio-stream", "tracing", "tracing-appender", @@ -5022,7 +5073,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" 
dependencies = [ "base16ct", - "der", + "der 0.6.1", "generic-array", "pkcs8", "subtle", @@ -5066,7 +5117,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" dependencies = [ "httpdate", "reqwest", - "rustls", + "rustls 0.21.9", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5188,7 +5239,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5269,7 +5320,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5355,6 +5406,15 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "simple_asn1" version = "0.6.2" @@ -5439,7 +5499,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" dependencies = [ "base64ct", - "der", + "der 0.6.1", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.8", ] [[package]] @@ -5542,9 +5612,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.32" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -5659,22 +5729,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.47" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.47" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5845,7 +5915,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5883,16 +5953,17 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f" +checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", - "ring 0.16.20", - "rustls", + "ring 0.17.6", + "rustls 0.22.2", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.25.0", + "x509-certificate", ] [[package]] @@ -5901,7 +5972,18 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls", + "rustls 0.21.9", + "tokio", 
+] + +[[package]] +name = "tokio-rustls" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" +dependencies = [ + "rustls 0.22.2", + "rustls-pki-types", "tokio", ] @@ -6016,9 +6098,9 @@ dependencies = [ "pin-project", "prost", "rustls-native-certs", - "rustls-pemfile", + "rustls-pemfile 1.0.2", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-stream", "tower", "tower-layer", @@ -6114,7 +6196,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -6330,7 +6412,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls", + "rustls 0.21.9", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -6572,7 +6654,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-shared", ] @@ -6606,7 +6688,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6939,19 +7021,18 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest", - "ring 0.16.20", - "rustls", + "rustls 0.21.9", "scopeguard", "serde", "serde_json", "smallvec", "subtle", "syn 1.0.109", - "syn 2.0.32", + "syn 2.0.52", "time", "time-macros", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "toml_datetime", "toml_edit", @@ -6962,11 +7043,31 @@ dependencies = [ "tungstenite", "url", "uuid", + "zeroize", "zstd", "zstd-safe", "zstd-sys", ] +[[package]] +name = "x509-certificate" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66534846dec7a11d7c50a74b7cdb208b9a581cad890b7866430d438455847c85" +dependencies = [ + "bcder", + "bytes", + "chrono", + "der 0.7.8", + "hex", + "pem", + "ring 0.17.6", + "signature 2.2.0", + "spki 0.7.3", + "thiserror", + "zeroize", +] + [[package]] name = "x509-parser" version = "0.15.0" @@ -7025,7 +7126,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -7033,6 +7134,20 @@ name = "zeroize" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 42deaac19b..76f4ff041c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,8 +129,8 @@ reqwest-retry = "0.2.2" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.21" -rustls-pemfile = "1" +rustls = "0.22" +rustls-pemfile = "2" rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" @@ -159,8 +159,8 @@ tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.10.0" -tokio-rustls = "0.24" +tokio-postgres-rustls = "0.11.0" +tokio-rustls = "0.25" tokio-stream = "0.1" 
tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } @@ -219,7 +219,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.11" +rcgen = "0.12" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.9" diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index e046fa5260..80df9db858 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -72,14 +72,19 @@ async fn simple_select() { } } -static KEY: Lazy = Lazy::new(|| { +static KEY: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) + let key = rustls_pemfile::rsa_private_keys(&mut cursor) + .next() + .unwrap() + .unwrap(); + rustls::pki_types::PrivateKeyDer::Pkcs1(key) }); -static CERT: Lazy = Lazy::new(|| { +static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) + let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap(); + cert }); // test that basic select with ssl works @@ -88,9 +93,8 @@ async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone()) + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = @@ -102,10 +106,9 @@ async fn simple_select_ssl() { }); let client_cfg = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&CERT).unwrap(); + store.add(CERT.clone()).unwrap(); store }) .with_no_client_auth(); diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index d5ab66d6aa..385f7820cb 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,6 +10,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::proxy::run_until_cancelled; +use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; @@ -76,37 +77,40 @@ async fn main() -> anyhow::Result<()> { (Some(key_path), Some(cert_path)) => { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - let cert_chain = { + let cert_chain: Vec<_> = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? 
- .into_iter() - .map(rustls::Certificate) - .collect_vec() + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? }; // needed for channel bindings let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); + let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); (tls_config, tls_server_end_point) } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9f276c3c24..437ec9f401 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,10 @@ use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; -use rustls::{sign, Certificate, PrivateKey}; +use itertools::Itertools; +use rustls::{ + crypto::ring::sign, + pki_types::{CertificateDer, PrivateKeyDer}, +}; use sha2::{Digest, Sha256}; use std::{ collections::{HashMap, HashSet}, @@ -88,14 +92,14 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); - let config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - // allow TLS 1.2 to be compatible with older client libraries - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + // allow TLS 1.2 to be compatible with older client libraries + let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()) + .into(); Ok(TlsConfig { config, @@ -133,14 +137,14 @@ pub enum TlsServerEndPoint { } impl TlsServerEndPoint { - pub fn new(cert: &Certificate) -> anyhow::Result { + pub fn new(cert: &CertificateDer) -> anyhow::Result { let sha256_oids = [ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid_registry::OID_SIG_ECDSA_WITH_SHA256, oid_registry::OID_PKCS1_SHA256WITHRSA, ]; - let pem = x509_parser::parse_x509_certificate(&cert.0) + let pem = x509_parser::parse_x509_certificate(cert) .context("Failed to parse PEM object from cerficiate")? 
.1; @@ -150,8 +154,7 @@ impl TlsServerEndPoint { let oid = pem.signature_algorithm.oid(); let alg = reg.get(oid); if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = - Sha256::new().chain_update(&cert.0).finalize().into(); + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { @@ -165,7 +168,7 @@ impl TlsServerEndPoint { } } -#[derive(Default)] +#[derive(Default, Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, default: Option<(Arc, TlsServerEndPoint)>, @@ -185,11 +188,14 @@ impl CertResolver { let priv_key = { let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to parse TLS keys at '{key_path}'"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to parse TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) @@ -197,14 +203,10 @@ impl CertResolver { let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() .with_context(|| { - format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ) + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") })? - .into_iter() - .map(rustls::Certificate) - .collect() }; self.add_cert(priv_key, cert_chain, is_default) @@ -212,15 +214,15 @@ impl CertResolver { pub fn add_cert( &mut self, - priv_key: PrivateKey, - cert_chain: Vec, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, is_default: bool, ) -> anyhow::Result<()> { let key = sign::any_supported_type(&priv_key).context("invalid private key")?; let first_cert = &cert_chain[0]; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(&first_cert.0) + let pem = x509_parser::parse_x509_certificate(first_cert) .context("Failed to parse PEM object from cerficiate")? 
.1; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index d866b1820f..5d0340e852 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -20,6 +20,7 @@ use crate::{http, sasl, scram}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; +use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; @@ -28,7 +29,11 @@ use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; fn generate_certs( hostname: &str, common_name: &str, -) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> { +) -> anyhow::Result<( + pki_types::CertificateDer<'static>, + pki_types::CertificateDer<'static>, + pki_types::PrivateKeyDer<'static>, +)> { let ca = rcgen::Certificate::from_params({ let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); @@ -45,9 +50,9 @@ fn generate_certs( })?; Ok(( - rustls::Certificate(ca.serialize_der()?), - rustls::Certificate(cert.serialize_der_with_signer(&ca)?), - rustls::PrivateKey(cert.serialize_private_key_der()), + pki_types::CertificateDer::from(ca.serialize_der()?), + pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), + pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), )) } @@ -82,9 +87,8 @@ fn generate_tls_config<'a>( let tls_config = { let config = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone())? + .with_single_cert(vec![cert.clone()], key.clone_key())? .into(); let mut cert_resolver = CertResolver::new(); @@ -101,10 +105,9 @@ fn generate_tls_config<'a>( let client_config = { let config = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&ca)?; + store.add(ca)?; store }) .with_no_client_auth(); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e808fabbe7..8593b752c2 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,6 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16" } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -80,6 +79,7 @@ tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } From 0f05ef67e28fc0c26e0b1300edad82d4e054e24f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 7 Mar 2024 19:53:10 +0000 Subject: [PATCH 45/52] pageserver: revert open layer rolling revert (#6962) ## Problem We reverted https://github.com/neondatabase/neon/pull/6661 a few days ago. 
The change led to OOMs in benchmarks followed by large WAL reingests. The issue was that we removed [this code](https://github.com/neondatabase/neon/blob/d04af08567cc3ff94ff19a2f6b3f7a2a1e3c55d1/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs#L409-L417). That call may trigger a roll of the open layer due to the keepalive messages received from the safekeeper. Removing it meant that enforcing of checkpoint timeout became even more lax and led to using up large amounts of memory for the in memory layer indices. ## Summary of changes Piggyback on keep alive messages to enforce checkpoint timeout. This is a hack, but it's exactly what the current code is doing. ## Alternatives Christhian, Joonas and myself sketched out a timer based approach [here](https://github.com/neondatabase/neon/pull/6940). While discussing it further, it became obvious that's also a bit of a hack and not the desired end state. I chose not to take that further since it's not what we ultimately want and it'll be harder to rip out. Right now it's unclear what the ideal system behaviour is: * early flushing on memory pressure, or ... * detaching tenants on memory pressure --- pageserver/src/pgdatadir_mapping.rs | 17 +- pageserver/src/tenant.rs | 36 +- .../tenant/storage_layer/inmemory_layer.rs | 38 +- pageserver/src/tenant/timeline.rs | 375 +++++++++++++----- .../walreceiver/walreceiver_connection.rs | 36 +- test_runner/performance/test_layer_map.py | 4 +- 6 files changed, 322 insertions(+), 184 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 628aeb5a28..727650a5a5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -15,6 +15,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; +use itertools::Itertools; use pageserver_api::key::{ dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, @@ -1498,7 +1499,7 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); @@ -1537,14 +1538,22 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - writer.put_batch(&self.pending_updates, ctx).await?; - self.pending_updates.clear(); + // The put_batch call below expects expects the inputs to be sorted by Lsn, + // so we do that first. 
+ let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self + .pending_updates + .drain() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) + .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) + .collect(); + + writer.put_batch(lsn_ordered_batch, ctx).await?; } if !self.pending_deletions.is_empty() { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2f23e535fa..4f4654422b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3857,7 +3857,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3869,7 +3869,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -3935,7 +3935,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap(); @@ -3969,7 +3969,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let new_writer = newtline.writer().await; + let mut new_writer = newtline.writer().await; new_writer .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; @@ -4001,7 +4001,7 @@ mod tests { ) -> anyhow::Result<()> { let mut lsn = start_lsn; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; // Create a relation on the timeline writer .put( @@ -4026,7 +4026,7 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4389,7 +4389,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4406,7 +4406,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4423,7 +4423,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4440,7 +4440,7 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, @@ -4497,7 +4497,7 @@ mod tests { for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; - let writer = timeline.writer().await; + let mut writer = timeline.writer().await; writer .put( test_key, @@ -4690,7 +4690,7 @@ mod tests { current_lsn += 0x100; - let writer = current_timeline.writer().await; + let mut writer = current_timeline.writer().await; writer .put( gap_at_key, @@ -4729,7 +4729,7 @@ mod tests { current_lsn += 0x10; - let writer = child_timeline.writer().await; + let mut writer = child_timeline.writer().await; writer .put( current_key, @@ -4807,7 +4807,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4828,7 +4828,7 @@ mod tests { 
lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4896,7 +4896,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -4925,7 +4925,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, @@ -5002,7 +5002,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index e7da28b8d6..5f1db21d49 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -336,32 +336,17 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, val, ctx).await - } - - pub(crate) async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); - for (key, vals) in values { - for (lsn, val) in vals { - self.put_value_locked(&mut inner, *key, *lsn, val, ctx) - .await?; - } - } - Ok(()) + self.put_value_locked(&mut inner, key, lsn, buf, ctx).await } async fn put_value_locked( @@ -369,22 +354,16 @@ impl InMemoryLayer { locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { - // Avoid doing allocations for "small" values. 
- // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - buf.clear(); - val.ser_into(&mut buf)?; locked_inner .file .write_blob( - &buf, + buf, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -412,7 +391,12 @@ impl InMemoryLayer { pub async fn freeze(&self, end_lsn: Lsn) { let inner = self.inner.write().await; - assert!(self.start_lsn < end_lsn); + assert!( + self.start_lsn < end_lsn, + "{} >= {}", + self.start_lsn, + end_lsn + ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 71a958206c..7004db1cb5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -27,6 +27,18 @@ use pageserver_api::{ }; use rand::Rng; use serde_with::serde_as; +use storage_broker::BrokerClientChannel; +use tokio::{ + runtime::Handle, + sync::{oneshot, watch}, +}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::{ + bin_ser::BeSer, + sync::gate::{Gate, GateGuard}, +}; + use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; @@ -41,14 +53,6 @@ use std::{ cmp::{max, min, Ordering}, ops::ControlFlow, }; -use storage_broker::BrokerClientChannel; -use tokio::{ - runtime::Handle, - sync::{oneshot, watch}, -}; -use tokio_util::sync::CancellationToken; -use tracing::*; -use utils::sync::gate::{Gate, GateGuard}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ @@ -271,7 +275,7 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. - write_lock: tokio::sync::Mutex<()>, + write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running pub(super) flush_loop_state: Mutex, @@ -917,8 +921,6 @@ impl Timeline { seq: &Bytes, vec: &Bytes, ) { - use utils::bin_ser::BeSer; - if *key == AUX_FILES_KEY { // The value reconstruct of AUX_FILES_KEY from records is not deterministic // since it uses a hash map under the hood. Hence, deserialise both results @@ -1149,58 +1151,10 @@ impl Timeline { pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().await, + write_guard: self.write_lock.lock().await, } } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - pub(crate) async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let open_layer_size = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - let Some(open_layer) = layers.open_layer.as_ref() else { - return Ok(()); - }; - open_layer.size().await? - }; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Rolling the open layer can be triggered by: - // 1. The distance from the last LSN we rolled at. 
This bounds the amount of WAL that - // the safekeepers need to store. For sharded tenants, we multiply by shard count to - // account for how writes are distributed across shards: we expect each node to consume - // 1/count of the LSN on average. - // 2. The size of the currently open layer. - // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught - // up and suspend activity. - if (distance - >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128) - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true).await; - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Wake up the layer flusher - self.flush_frozen_layers(); - } - Ok(()) - } - pub(crate) fn activate( self: &Arc, broker_client: BrokerClientChannel, @@ -1635,7 +1589,7 @@ impl Timeline { layer_flush_start_tx, layer_flush_done_tx, - write_lock: tokio::sync::Mutex::new(()), + write_lock: tokio::sync::Mutex::new(None), gc_info: std::sync::RwLock::new(GcInfo { retain_lsns: Vec::new(), @@ -2961,43 +2915,6 @@ impl Timeline { Ok(layer) } - async fn put_value( - &self, - key: Key, - lsn: Lsn, - val: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val, ctx).await?; - Ok(()) - } - - async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Pick the first LSN in the batch to get the layer to write to. - for lsns in values.values() { - if let Some((lsn, _)) = lsns.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_values(values, ctx).await?; - break; - } - } - Ok(()) - } - - async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { - if let Some((_, lsn)) = tombstones.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_tombstones(tombstones).await?; - } - Ok(()) - } - pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); @@ -3008,14 +2925,20 @@ impl Timeline { async fn freeze_inmem_layer(&self, write_lock_held: bool) { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. + let _write_guard = if write_lock_held { None } else { Some(self.write_lock.lock().await) }; + + self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + } + + async fn freeze_inmem_layer_at(&self, at: Lsn) { let mut guard = self.layers.write().await; guard - .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) + .try_freeze_in_memory_layer(at, &self.last_freeze_at) .await; } @@ -4392,13 +4315,43 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon PageReconstructError::from(msg) } +struct TimelineWriterState { + open_layer: Arc, + current_size: u64, + // Previous Lsn which passed through + prev_lsn: Option, + // Largest Lsn which passed through the current writer + max_lsn: Option, + // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. 
+ cached_last_freeze_at: Lsn, + cached_last_freeze_ts: Instant, +} + +impl TimelineWriterState { + fn new( + open_layer: Arc, + current_size: u64, + last_freeze_at: Lsn, + last_freeze_ts: Instant, + ) -> Self { + Self { + open_layer, + current_size, + prev_lsn: None, + max_lsn: None, + cached_last_freeze_at: last_freeze_at, + cached_last_freeze_ts: last_freeze_ts, + } + } +} + /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: tokio::sync::MutexGuard<'a, ()>, + write_guard: tokio::sync::MutexGuard<'a, Option>, } impl Deref for TimelineWriter<'_> { @@ -4409,31 +4362,239 @@ impl Deref for TimelineWriter<'_> { } } +impl Drop for TimelineWriter<'_> { + fn drop(&mut self) { + self.write_guard.take(); + } +} + +#[derive(PartialEq)] +enum OpenLayerAction { + Roll, + Open, + None, +} + impl<'a> TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. pub(crate) async fn put( - &self, + &mut self, key: Key, lsn: Lsn, value: &Value, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_value(key, lsn, value, ctx).await + // Avoid doing allocations for "small" values. + // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: + // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 + let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); + value.ser_into(&mut buf)?; + let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); + + let action = self.get_open_layer_action(lsn, buf_size); + let layer = self.handle_open_layer_action(lsn, action).await?; + let res = layer.put_value(key, lsn, &buf, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. + let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); + } + + res } + /// "Tick" the timeline writer: it will roll the open layer if required + /// and do nothing else. + pub(crate) async fn tick(&mut self) -> anyhow::Result<()> { + self.open_layer_if_present().await?; + + let last_record_lsn = self.get_last_record_lsn(); + let action = self.get_open_layer_action(last_record_lsn, 0); + if action == OpenLayerAction::Roll { + self.roll_layer(last_record_lsn).await?; + } + + Ok(()) + } + + /// Populate the timeline writer state only if an in-memory layer + /// is already open. 
+ async fn open_layer_if_present(&mut self) -> anyhow::Result<()> { + assert!(self.write_guard.is_none()); + + let open_layer = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + match layers.open_layer { + Some(ref open_layer) => open_layer.clone(), + None => { + return Ok(()); + } + } + }; + + let initial_size = open_layer.size().await?; + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + open_layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + + Ok(()) + } + + async fn handle_open_layer_action( + &mut self, + at: Lsn, + action: OpenLayerAction, + ) -> anyhow::Result<&Arc> { + match action { + OpenLayerAction::Roll => { + let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); + self.roll_layer(freeze_at).await?; + self.open_layer(at).await?; + } + OpenLayerAction::Open => self.open_layer(at).await?, + OpenLayerAction::None => { + assert!(self.write_guard.is_some()); + } + } + + Ok(&self.write_guard.as_ref().unwrap().open_layer) + } + + async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> { + let layer = self.tl.get_layer_for_write(at).await?; + let initial_size = layer.size().await?; + + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + self.write_guard.replace(TimelineWriterState::new( + layer, + initial_size, + last_freeze_at, + last_freeze_ts, + )); + + Ok(()) + } + + async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + assert!(self.write_guard.is_some()); + + self.tl.freeze_inmem_layer_at(freeze_at).await; + + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; + + self.tl.flush_frozen_layers(); + + let current_size = self.write_guard.as_ref().unwrap().current_size; + if current_size > self.get_checkpoint_distance() { + warn!("Flushed oversized open layer with size {}", current_size) + } + + Ok(()) + } + + fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction { + let state = &*self.write_guard; + let Some(state) = &state else { + return OpenLayerAction::Open; + }; + + if state.prev_lsn == Some(lsn) { + // Rolling mid LSN is not supported by downstream code. + // Hence, only roll at LSN boundaries. + return OpenLayerAction::None; + } + + if state.current_size == 0 { + // Don't roll empty layers + return OpenLayerAction::None; + } + + let distance = lsn.widening_sub(state.cached_last_freeze_at); + let proposed_open_layer_size = state.current_size + new_value_size; + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. 
+ if distance + >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128 + { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + lsn, state.current_size, distance + ); + + OpenLayerAction::Roll + } else if proposed_open_layer_size >= self.get_checkpoint_distance() { + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + lsn, state.current_size, proposed_open_layer_size + ); + + OpenLayerAction::Roll + } else if distance > 0 + && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() + { + info!( + "Will roll layer at {} with layer size {} due to time since last flush ({:?})", + lsn, + state.current_size, + state.cached_last_freeze_ts.elapsed() + ); + + OpenLayerAction::Roll + } else { + OpenLayerAction::None + } + } + + /// Put a batch keys at the specified Lsns. + /// + /// The batch should be sorted by Lsn such that it's safe + /// to roll the open layer mid batch. pub(crate) async fn put_batch( - &self, - batch: &HashMap>, + &mut self, + batch: Vec<(Key, Lsn, Value)>, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_values(batch, ctx).await + for (key, lsn, val) in batch { + self.put(key, lsn, &val, ctx).await? + } + + Ok(()) } - pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { - self.tl.put_tombstones(batch).await + pub(crate) async fn delete_batch(&mut self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = batch.first() { + let action = self.get_open_layer_action(*lsn, 0); + let layer = self.handle_open_layer_action(*lsn, action).await?; + layer.put_tombstones(batch).await?; + } + + Ok(()) } /// Track the end of the latest digested WAL record. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 9cb53f46d1..8297ca6563 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -343,23 +343,6 @@ pub(super) async fn handle_walreceiver_connection( modification.commit(&ctx).await?; uncommitted_records = 0; filtered_records = 0; - - // - // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise - // layer size can become much larger than `checkpoint_distance`. - // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large - // amount of data to key-value storage. So performing this check only after processing - // all WAL records in the chunk, can cause huge L0 layer files. - // - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; } } @@ -406,15 +389,16 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; + { + // This is a hack. It piggybacks on the keepalive messages sent by the + // safekeeper in order to enforce `checkpoint_timeout` on the currently + // open layer. This hack doesn't provide a bound on the total size of + // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916. 
+ let mut writer = timeline.writer().await; + if let Err(err) = writer.tick().await { + warn!("Timeline writer tick failed: {err}"); + } + } if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 6bd0d85fa2..9b20954d45 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): tenant, _ = env.neon_cli.create_tenant( conf={ "gc_period": "0s", - "checkpoint_distance": "8192", + "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", - "compaction_target_size": "8192", + "compaction_target_size": "16384", } ) From 2c132e45cb624a39ac7f23ea78f082078277a450 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 8 Mar 2024 07:56:23 +0000 Subject: [PATCH 46/52] proxy: do not store ephemeral endpoints in http pool (#6819) ## Problem For the ephemeral endpoint feature, it's not really too helpful to keep them around in the connection pool. This isn't really pressing but I think it's still a bit better this way. ## Summary of changes Add `is_ephemeral` function to `NeonOptions`. Allow `serverless::ConnInfo::endpoint_cache_key()` to return an `Option`. Handle that option appropriately --- proxy/src/proxy.rs | 5 +++++ proxy/src/serverless/conn_pool.rs | 30 +++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index aeba08bc4f..7848fc2ac2 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -380,6 +380,11 @@ impl NeonOptions { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub fn is_ephemeral(&self) -> bool { + // Currently, neon endpoint options are all reserved for ephemeral endpoints. + !self.0.is_empty() + } + fn parse_from_iter<'a>(options: impl Iterator) -> Self { let mut options = options .filter_map(neon_option) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 7d705ba049..73f213d074 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -43,8 +43,13 @@ impl ConnInfo { (self.dbname.clone(), self.user_info.user.clone()) } - pub fn endpoint_cache_key(&self) -> EndpointCacheKey { - self.user_info.endpoint_cache_key() + pub fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. 
+ if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } } } @@ -360,8 +365,11 @@ impl GlobalConnPool { conn_info: &ConnInfo, ) -> Result>, HttpConnError> { let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; - let endpoint_pool = self.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key()); + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); if let Some(entry) = endpoint_pool .write() .get_conn_entry(conn_info.db_and_user()) @@ -455,8 +463,10 @@ pub fn poll_client( span.in_scope(|| { info!(%conn_info, %session_id, "new connection"); }); - let pool = - Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let pool = match conn_info.endpoint_cache_key() { + Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), + None => Weak::new(), + }; let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); @@ -723,8 +733,9 @@ mod tests { dbname: "dbname".into(), password: "password".as_bytes().into(), }; - let ep_pool = - Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); @@ -780,8 +791,9 @@ mod tests { dbname: "dbname".into(), password: "password".as_bytes().into(), }; - let ep_pool = - Arc::downgrade(&pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key())); + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); client.do_drop().unwrap()(); From 7329413705be0939b550553be2f40d4bb11a1a9b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 8 Mar 2024 15:34:53 +0000 Subject: [PATCH 47/52] storage controller: enable setting PlacementPolicy in tenant creation (#7037) ## Problem Tenants created via the storage controller have a `PlacementPolicy` that defines their HA/secondary/detach intent. For backward compat we can just set it to Single, for onboarding tenants using /location_conf it is automatically set to Double(1) if there are at least two pageservers, but for freshly created tenants we didn't have a way to specify it. This unblocks writing tests that create HA tenants on the storage controller and do failure injection testing. ## Summary of changes - Add optional fields to TenantCreateRequest for specifying PlacementPolicy. This request structure is used both on pageserver API and storage controller API, but this method is only meaningful for the storage controller (same as existing `shard_parameters` attribute). - Use the value from the creation request in tenant creation, if provided. 
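
As an illustration, here is a minimal, self-contained sketch of what the new field looks like on the wire. The types below only mirror the relevant parts of `TenantCreateRequest` and `PlacementPolicy` (the real request also carries `generation`, `shard_parameters` and the flattened `TenantConfig`, and `new_tenant_id` is a `TenantShardId` rather than a plain string); the tenant id value is made up.

```rust
use serde::{Deserialize, Serialize};

// Simplified mirror of pageserver_api::controller_api::PlacementPolicy.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
enum PlacementPolicy {
    Single,
    Double(usize),
    Secondary,
    Detached,
}

// Simplified mirror of the creation request: only the fields relevant here.
#[derive(Serialize, Deserialize)]
struct TenantCreateRequest {
    new_tenant_id: String, // simplified; the real field is a TenantShardId
    #[serde(default, skip_serializing_if = "Option::is_none")]
    placement_policy: Option<PlacementPolicy>,
}

fn main() {
    // Legacy callers that don't set the field produce the same body as before,
    // and the storage controller falls back to PlacementPolicy::Single.
    let legacy = TenantCreateRequest {
        new_tenant_id: "0123456789abcdef0123456789abcdef".to_string(),
        placement_policy: None,
    };
    assert!(!serde_json::to_string(&legacy).unwrap().contains("placement_policy"));

    // An HA tenant: one attached location plus one secondary.
    let ha = TenantCreateRequest {
        new_tenant_id: "0123456789abcdef0123456789abcdef".to_string(),
        placement_policy: Some(PlacementPolicy::Double(1)),
    };
    let body = serde_json::to_string(&ha).unwrap();
    // serde's externally tagged enum encoding, pinned by the new
    // placement_policy_encoding test: Double(1) -> {"Double":1}, Single -> "Single".
    assert!(body.contains(r#""placement_policy":{"Double":1}"#));
}
```

Requests that omit `placement_policy` keep today's behaviour, since `do_tenant_create` defaults a missing value to `Single`.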
--- control_plane/attachment_service/src/http.rs | 7 +-- control_plane/attachment_service/src/lib.rs | 25 +-------- .../attachment_service/src/persistence.rs | 11 ++-- .../attachment_service/src/service.rs | 55 ++++++++++--------- .../attachment_service/src/tenant_state.rs | 3 +- control_plane/src/bin/neon_local.rs | 9 ++- control_plane/src/pageserver.rs | 2 + libs/pageserver_api/src/controller_api.rs | 40 ++++++++++++++ libs/pageserver_api/src/models.rs | 6 ++ 9 files changed, 92 insertions(+), 66 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 384bdcef0c..7e4030b221 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,6 +1,5 @@ use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; -use crate::PlacementPolicy; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ @@ -119,13 +118,9 @@ async fn handle_tenant_create( let create_req = json_request::(&mut req).await?; - // TODO: enable specifying this. Using Single as a default helps legacy tests to work (they - // have no expectation of HA). - let placement_policy = PlacementPolicy::Single; - json_response( StatusCode::CREATED, - service.tenant_create(create_req, placement_policy).await?, + service.tenant_create(create_req).await?, ) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 7ae7e264c7..796b465c10 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -1,4 +1,4 @@ -use serde::{Deserialize, Serialize}; +use serde::Serialize; use utils::seqwait::MonotonicCounter; mod auth; @@ -13,23 +13,6 @@ mod schema; pub mod service; mod tenant_state; -#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] -enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. - Double(usize), - /// Create one secondary mode locations. This is useful when onboarding - /// a tenant, or for an idle tenant that we might want to bring online quickly. - Secondary, - - /// Do not attach to any pageservers. This is appropriate for tenants that - /// have been idle for a long time, where we do not mind some delay in making - /// them available in future. 
- Detached, -} - #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); @@ -66,9 +49,3 @@ impl Sequence { Sequence(self.0 + 1) } } - -impl Default for PlacementPolicy { - fn default() -> Self { - PlacementPolicy::Double(1) - } -} diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index d5c304385c..d5c6d74ebe 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -7,11 +7,9 @@ use self::split_state::SplitState; use camino::Utf8Path; use camino::Utf8PathBuf; use diesel::pg::PgConnection; -use diesel::{ - Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl, - Selectable, SelectableHelper, -}; -use pageserver_api::controller_api::NodeSchedulingPolicy; +use diesel::prelude::*; +use diesel::Connection; +use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; @@ -19,7 +17,6 @@ use utils::generation::Generation; use utils::id::{NodeId, TenantId}; use crate::node::Node; -use crate::PlacementPolicy; /// ## What do we store? /// @@ -210,7 +207,7 @@ impl Persistence { tenant.tenant_id = tenant_id.to_string(); tenant.config = serde_json::to_string(&TenantConfig::default()) .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default()) + tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single) .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; } } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index f41c4f89b9..556d6a6828 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -16,9 +16,9 @@ use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantLocateResponse, TenantShardMigrateRequest, - TenantShardMigrateResponse, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::TenantConfigRequest, }; @@ -57,7 +57,7 @@ use crate::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, ReconcilerWaiter, TenantState, }, - PlacementPolicy, Sequence, + Sequence, }; // For operations that should be quick, like attaching a new tenant @@ -176,7 +176,7 @@ impl From for ApiError { #[allow(clippy::large_enum_variant)] enum TenantCreateOrUpdate { - Create((TenantCreateRequest, PlacementPolicy)), + Create(TenantCreateRequest), Update(Vec), } @@ -792,7 +792,7 @@ impl Service { shard_stripe_size: 0, generation: Some(0), generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(), + placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), }; @@ -1053,9 +1053,8 @@ impl Service { pub(crate) async fn tenant_create( &self, create_req: TenantCreateRequest, - 
placement_policy: PlacementPolicy, ) -> Result { - let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?; + let (response, waiters) = self.do_tenant_create(create_req).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; Ok(response) @@ -1064,8 +1063,13 @@ impl Service { pub(crate) async fn do_tenant_create( &self, create_req: TenantCreateRequest, - placement_policy: PlacementPolicy, ) -> Result<(TenantCreateResponse, Vec), ApiError> { + // As a default, single is convenient for tests that don't choose a policy. + let placement_policy = create_req + .placement_policy + .clone() + .unwrap_or(PlacementPolicy::Single); + // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. let tenant_id = if !create_req.new_tenant_id.is_unsharded() { @@ -1339,22 +1343,20 @@ impl Service { TenantCreateOrUpdate::Create( // Synthesize a creation request - ( - TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation, - shard_parameters: ShardParameters { - // Must preserve the incoming shard_count do distinguish unsharded (0) - // from single-sharded (1): this distinction appears in the S3 keys of the tenant. - count: req.tenant_id.shard_count, - // We only import un-sharded or single-sharded tenants, so stripe - // size can be made up arbitrarily here. - stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, - }, - config: req.config.tenant_conf, + TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + shard_parameters: ShardParameters { + // Must preserve the incoming shard_count do distinguish unsharded (0) + // from single-sharded (1): this distinction appears in the S3 keys of the tenant. + count: req.tenant_id.shard_count, + // We only import un-sharded or single-sharded tenants, so stripe + // size can be made up arbitrarily here. 
+ stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, }, - placement_policy, - ), + placement_policy: Some(placement_policy), + config: req.config.tenant_conf, + }, ) } else { TenantCreateOrUpdate::Update(updates) @@ -1393,9 +1395,8 @@ impl Service { stripe_size: None, }; let waiters = match create_or_update { - TenantCreateOrUpdate::Create((create_req, placement_policy)) => { - let (create_resp, waiters) = - self.do_tenant_create(create_req, placement_policy).await?; + TenantCreateOrUpdate::Create(create_req) => { + let (create_resp, waiters) = self.do_tenant_create(create_req).await?; result.shards = create_resp .shards .into_iter() diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index ddb9866527..c775736b31 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -5,6 +5,7 @@ use std::{ }; use crate::{metrics, persistence::TenantShardPersistence}; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -28,7 +29,7 @@ use crate::{ attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, }, scheduler::{ScheduleError, Scheduler}, - service, PlacementPolicy, Sequence, + service, Sequence, }; /// Serialization helper diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1feec5cd9b..27abcb182a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -15,7 +15,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::{broker, local_env}; use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, + NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, }; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, @@ -435,6 +435,11 @@ async fn handle_tenant( let shard_stripe_size: Option = create_match.get_one::("shard-stripe-size").cloned(); + let placement_policy = match create_match.get_one::("placement-policy") { + Some(s) if !s.is_empty() => serde_json::from_str::(s)?, + _ => PlacementPolicy::Single, + }; + let tenant_conf = PageServerNode::parse_config(tenant_conf)?; // If tenant ID was not specified, generate one @@ -456,6 +461,7 @@ async fn handle_tenant( .map(ShardStripeSize) .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), }, + placement_policy: Some(placement_policy), config: tenant_conf, }) .await?; @@ -1562,6 +1568,7 @@ fn cli() -> Command { .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) + .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant")) ) .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) diff --git 
a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index b2904c1191..ae1bd60c52 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -429,6 +429,8 @@ impl PageServerNode { generation, config, shard_parameters: ShardParameters::default(), + // Placement policy is not meaningful for creations not done via storage controller + placement_policy: None, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 64b70a1a51..38e61239c5 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -125,5 +125,45 @@ impl From for String { } } +/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether +/// to create secondary locations. +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] +pub enum PlacementPolicy { + /// Cheapest way to attach a tenant: just one pageserver, no secondary + Single, + /// Production-ready way to attach a tenant: one attached pageserver and + /// some number of secondaries. + Double(usize), + /// Create one secondary mode locations. This is useful when onboarding + /// a tenant, or for an idle tenant that we might want to bring online quickly. + Secondary, + + /// Do not attach to any pageservers. This is appropriate for tenants that + /// have been idle for a long time, where we do not mind some delay in making + /// them available in future. + Detached, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} + +#[cfg(test)] +mod test { + use super::*; + use serde_json; + + /// Check stability of PlacementPolicy's serialization + #[test] + fn placement_policy_encoding() -> anyhow::Result<()> { + let v = PlacementPolicy::Double(1); + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "{\"Double\":1}"); + assert_eq!(serde_json::from_str::(&encoded)?, v); + + let v = PlacementPolicy::Single; + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "\"Single\""); + assert_eq!(serde_json::from_str::(&encoded)?, v); + Ok(()) + } +} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 57497e3831..fe5bbd1c06 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -21,6 +21,7 @@ use utils::{ lsn::Lsn, }; +use crate::controller_api::PlacementPolicy; use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, @@ -242,6 +243,11 @@ pub struct TenantCreateRequest { #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] pub shard_parameters: ShardParameters, + // This parameter is only meaningful in requests sent to the storage controller + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } From 86e8c43ddf817c7e3ee112e5c399cc5d60b34f29 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 8 Mar 2024 20:42:35 +0000 Subject: [PATCH 48/52] Add downgrade scripts for neon extension. (#7065) ## Problem When we start compute with newer version of extension (i.e. 1.2) and then rollback the release, downgrading the compute version, next compute start will try to update extension to the latest version available in neon.control (i.e. 1.1). 
Thus we need to provide downgrade scripts like neon--1.2--1.1.sql These scripts must revert the changes made by the upgrade scripts in the reverse order. This is necessary to ensure that the next upgrade will work correctly. In general, we need to write upgrade and downgrade scripts to be more robust and add IF EXISTS / CREATE OR REPLACE clauses to all statements (where applicable). ## Summary of changes Adds downgrade scripts. Adds test cases for extension downgrade/upgrade. fixes #7066 This is a follow-up for https://app.incident.io/neondb/incidents/167?tab=follow-ups Signed-off-by: Alex Chi Z Co-authored-by: Alex Chi Z Co-authored-by: Anastasia Lubennikova --- pgxn/neon/Makefile | 2 +- pgxn/neon/neon--1.1--1.0.sql | 6 +++++ pgxn/neon/neon--1.2--1.1.sql | 1 + pgxn/neon/neon--1.3--1.2.sql | 1 + test_runner/regress/test_neon_extension.py | 31 ++++++++++++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 pgxn/neon/neon--1.1--1.0.sql create mode 100644 pgxn/neon/neon--1.2--1.1.sql create mode 100644 pgxn/neon/neon--1.3--1.2.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 7ea767ec74..0bcb9545a6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/neon--1.1--1.0.sql b/pgxn/neon/neon--1.1--1.0.sql new file mode 100644 index 0000000000..e83e3104e8 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.0.sql @@ -0,0 +1,6 @@ +-- the order of operations is important here +-- because the view depends on the function + +DROP VIEW IF EXISTS neon_lfc_stats CASCADE; + +DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE; diff --git a/pgxn/neon/neon--1.2--1.1.sql b/pgxn/neon/neon--1.2--1.1.sql new file mode 100644 index 0000000000..c9f6a40f73 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.1.sql @@ -0,0 +1 @@ +DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE; diff --git a/pgxn/neon/neon--1.3--1.2.sql b/pgxn/neon/neon--1.3--1.2.sql new file mode 100644 index 0000000000..2733a15c75 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.2.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE; diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 1179a3afe9..e31e1cab51 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -29,3 +29,34 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): log.info(res) assert len(res) == 1 assert len(res[0]) == 5 + + +# Verify that the neon extension can be upgraded/downgraded. 
+def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_compatibility") + + endpoint_main = env.endpoints.create("test_neon_extension_compatibility") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + all_versions = ["1.3", "1.2", "1.1", "1.0"] + current_version = "1.3" + for idx, begin_version in enumerate(all_versions): + for target_version in all_versions[idx + 1 :]: + if current_version != begin_version: + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}" + ) + current_version = begin_version + # downgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}" + ) + # upgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" + ) From 4834d22d2d99bb7f9726c1cac3176550cc404e38 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 8 Mar 2024 13:24:30 -0900 Subject: [PATCH 49/52] Revoke REPLICATION (#7052) ## Problem Currently users can cause problems with replication ## Summary of changes Don't let them replicate --- compute_tools/src/spec.rs | 16 ++++++++++++++-- test_runner/regress/test_migrations.py | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 84a5a263af..ba3a84cda8 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { RoleAction::Create => { // This branch only runs when roles are created through the console, so it is // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited - // from neon_superuser. + // from neon_superuser. (NOTE: REPLICATION has been removed from here for now). let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", name.pg_quote() ); info!("running role create query: '{}'", &query); @@ -805,6 +805,18 @@ $$;"#, "", "", // Add new migrations below. + r#" +DO $$ +DECLARE + role_name TEXT; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + END LOOP; +END +$$;"#, ]; let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 3f626c5c7c..526ae14b87 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -15,7 +15,7 @@ def test_migrations(neon_simple_env: NeonEnv): endpoint.wait_for_migrations() - num_migrations = 8 + num_migrations = 9 with endpoint.cursor() as cur: cur.execute("SELECT id FROM neon_migration.migration_id") From 74d24582cfe67f4115b54d26e5fb787a221dcae4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 9 Mar 2024 13:37:02 +0100 Subject: [PATCH 50/52] throttling: exclude throttled time from basebackup (fixup of #6953) (#7072) PR #6953 only excluded throttled time from the handle_pagerequests (aka smgr metrics). 
This PR implements the deduction for `basebackup ` queries. The other page_service methods either don't use Timeline::get or they aren't used in production. Found by manually inspecting in [staging logs](https://neonprod.grafana.net/explore?schemaVersion=1&panes=%7B%22wx8%22:%7B%22datasource%22:%22xHHYY0dVz%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bhostname%3D%5C%22pageserver-0.eu-west-1.aws.neon.build%5C%22%7D%20%7C~%20%60git-env%7CERR%7CWARN%60%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22xHHYY0dVz%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22to%22:%221709919114642%22,%22from%22:%221709904430898%22%7D%7D%7D). --- libs/metrics/src/lib.rs | 1 - libs/metrics/src/metric_vec_duration.rs | 23 --------- pageserver/src/metrics.rs | 63 +++++++++++++++++++++++-- pageserver/src/page_service.rs | 50 ++++++++++---------- 4 files changed, 83 insertions(+), 54 deletions(-) delete mode 100644 libs/metrics/src/metric_vec_duration.rs diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 744fc18e61..22b0a18933 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -29,7 +29,6 @@ pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub mod metric_vec_duration; pub use hll::{HyperLogLog, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs deleted file mode 100644 index e9a0a65570..0000000000 --- a/libs/metrics/src/metric_vec_duration.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec`. - -use std::{future::Future, time::Instant}; - -pub trait DurationResultObserver { - fn observe_result(&self, res: &Result, duration: std::time::Duration); -} - -pub async fn observe_async_block_duration_by_result< - T, - E, - F: Future>, - O: DurationResultObserver, ->( - observer: &O, - block: F, -) -> Result { - let start = Instant::now(); - let result = block.await; - let duration = start.elapsed(); - observer.observe_result(&result, duration); - result -} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ee62ee0367..27e754e999 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,5 +1,4 @@ use enum_map::EnumMap; -use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -1283,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| }) }); -impl DurationResultObserver for BasebackupQueryTime { - fn observe_result(&self, res: &Result, duration: std::time::Duration) { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { + parent: &'a BasebackupQueryTime, + ctx: &'c RequestContext, + start: std::time::Instant, +} + +impl BasebackupQueryTime { + pub(crate) fn start_recording<'c: 'a, 'a>( + &'a self, + ctx: &'c RequestContext, + ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error opening micros_spent_throttled; 
this message is logged at a global rate limit"); + }); + } + } + BasebackupQueryTimeOngoingRecording { + parent: self, + ctx, + start, + } + } +} + +impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { + pub(crate) fn observe(self, res: &Result) { + let elapsed = self.start.elapsed(); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(ex_throttled) => ex_throttled, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap(); - metric.observe(duration.as_secs_f64()); + let metric = self + .parent + .0 + .get_metric_with_label_values(&[label_value]) + .unwrap(); + metric.observe(ex_throttled.as_secs_f64()); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index dacee41e6e..f3ceb7d3e6 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1199,7 +1199,7 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, - ctx: RequestContext, + ctx: &RequestContext, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, @@ -1214,7 +1214,7 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, &ctx).await?; + timeline.wait_lsn(lsn, ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -1236,7 +1236,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } else { @@ -1257,7 +1257,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; // shutdown the encoder to ensure the gzip footer is written @@ -1269,7 +1269,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) .await?; } @@ -1449,25 +1449,25 @@ where false }; - ::metrics::metric_vec_duration::observe_async_block_duration_by_result( - &*metrics::BASEBACKUP_QUERY_TIME, - async move { - self.handle_basebackup_request( - pgb, - tenant_id, - timeline_id, - lsn, - None, - false, - gzip, - ctx, - ) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - Result::<(), QueryError>::Ok(()) - }, - ) - .await?; + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let res = async { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + gzip, + &ctx, + ) + .await?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Result::<(), QueryError>::Ok(()) + } + .await; + metric_recording.observe(&res); + res?; } // return pair of prev_lsn and last_lsn else if query_string.starts_with("get_last_record_rlsn ") { @@ -1563,7 +1563,7 @@ where prev_lsn, true, false, - ctx, + &ctx, ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; From b09d68633510bdb12b017fb01ac055ffe7298833 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 9 Mar 2024 15:09:08 +0200 Subject: [PATCH 51/52] fix: on-demand downloads can outlive timeline shutdown (#7051) ## Problem 
Before this PR, it was possible that on-demand downloads were started after `Timeline::shutdown()`. For example, we have observed a walreceiver-connection-handler-initiated on-demand download that was started after `Timeline::shutdown()`s final `task_mgr::shutdown_tasks()` call. The underlying issue is that `task_mgr::shutdown_tasks()` isn't sticky, i.e., new tasks can be spawned during or after `task_mgr::shutdown_tasks()`. Cc: https://github.com/neondatabase/neon/issues/4175 in lieu of a more specific issue for task_mgr. We already decided we want to get rid of it anyways. Original investigation: https://neondb.slack.com/archives/C033RQ5SPDH/p1709824952465949 ## Changes - enter gate while downloading - use timeline cancellation token for cancelling download thereby, fixes #7054 Entering the gate might also remove recent "kept the gate from closing" in staging. --- libs/remote_storage/tests/test_real_s3.rs | 26 +++++++++++-------- pageserver/src/task_mgr.rs | 3 --- pageserver/src/tenant/storage_layer/layer.rs | 27 ++++++++------------ test_runner/regress/test_tenant_delete.py | 2 ++ test_runner/regress/test_timeline_delete.py | 4 ++- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index e927b40e80..d8b9824d99 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -17,6 +17,7 @@ use remote_storage::{ }; use test_context::test_context; use test_context::AsyncTestContext; +use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; @@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { )) .unwrap(); - let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await; { - let mut stream = ctx + let stream = ctx .client .download(&path, &cancel) .await .expect("download succeeds") .download_stream; - let first = stream - .next() - .await - .expect("should have the first blob") - .expect("should have succeeded"); + let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream)); - tracing::info!(len = first.len(), "downloaded first chunk"); + let first = reader.fill_buf().await.expect("should have the first blob"); + + let len = first.len(); + tracing::info!(len, "downloaded first chunk"); assert!( - first.len() < len, + first.len() < file_len, "uploaded file is too small, we downloaded all on first chunk" ); + reader.consume(len); + cancel.cancel(); - let next = stream.next().await.expect("stream should have more"); + let next = reader.fill_buf().await; let e = next.expect_err("expected an error, but got a chunk?"); @@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { .is_some_and(|e| matches!(e, DownloadError::Cancelled)), "{inner:?}" ); + + let e = DownloadError::from(e); + + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); } let cancel = CancellationToken::new(); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index adaa55c179..275a72c0b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -272,9 +272,6 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, - // Task that downloads a file from remote storage - RemoteDownloadTask, - // task that handles the initial downloading of all tenants InitialLoad, diff --git a/pageserver/src/tenant/storage_layer/layer.rs 
b/pageserver/src/tenant/storage_layer/layer.rs index 6c46b83622..aabb13b15c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -880,23 +880,18 @@ impl LayerInner { ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let task_name = format!("download layer {}", self); - let (tx, rx) = tokio::sync::oneshot::channel(); - // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot - // block tenant::mgr::remove_tenant_from_memory. - let this: Arc = self.clone(); - crate::task_mgr::spawn( - &tokio::runtime::Handle::current(), - crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id), - Some(self.desc.timeline_id), - &task_name, - false, - async move { + let guard = timeline + .gate + .enter() + .map_err(|_| DownloadError::DownloadCancelled)?; + + tokio::task::spawn(async move { + + let _guard = guard; let client = timeline .remote_client @@ -906,7 +901,7 @@ impl LayerInner { let result = client.download_layer_file( &this.desc.filename(), &this.metadata(), - &crate::task_mgr::shutdown_token() + &timeline.cancel ) .await; @@ -929,7 +924,6 @@ impl LayerInner { tokio::select! { _ = tokio::time::sleep(backoff) => {}, - _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, _ = timeline.cancel.cancelled() => {}, }; @@ -959,11 +953,10 @@ impl LayerInner { } } } - - Ok(()) } .in_current_span(), ); + match rx.await { Ok((Ok(()), permit)) => { if let Some(reason) = self diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c4b4e5fb77..52de889084 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -190,6 +190,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # So by ignoring these instead of waiting for empty upload queue # we execute more distinct code paths. '.*stopping left-over name="remote upload".*', + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 795110d90b..96a5cc491a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -213,7 +213,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( # This happens when timeline remains are cleaned up during loading ".*Timeline dir entry become invalid.*", # In one of the branches we poll for tenant to become active. Polls can generate this log message: - f".*Tenant {env.initial_tenant} is not active*", + f".*Tenant {env.initial_tenant} is not active.*", + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) From f0a9017008a5ce26a9329042c381d722a7ae5cb7 Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Mon, 11 Mar 2024 10:10:04 +0200 Subject: [PATCH 52/52] Export db size, deadlocks and changed row metrics (#7050) ## Problem We want to report metrics for the oldest user database. 
--- vm-image-spec.yaml | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index c1b7ad533a..5b93088303 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -142,6 +142,51 @@ files: query: | select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; + - metric_name: pg_stats_userdb + type: gauge + help: 'Stats for the oldest non-system db' + key_labels: + - datname + value_label: kind + values: + - db_size + - deadlocks + # Rows + - inserted + - updated + - deleted + # We export stats for only one non-system database. Without this limit + # it is too easy to abuse the system by creating lots of databases. + # We can try lifting this limit in the future after we understand the needs better. + query: | + select pg_database_size(datname) as db_size, deadlocks, + tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, + datname + from pg_stat_database + where datname IN ( + select datname + from pg_database + where datname <> 'postgres' and not datistemplate + order by oid + limit 1 + ); + + - metric_name: max_cluster_size + type: gauge + help: 'neon.max_cluster_size setting' + key_labels: + values: [max_cluster_size] + query: | + select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; + + - metric_name: db_total_size + type: gauge + help: 'Size of all databases' + key_labels: + values: [total] + query: | + select sum(pg_database_size(datname)) as total from pg_database; + build: | # Build cgroup-tools #