From f212630da2b83ae53448e11577d16c9b6703a316 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 8 Apr 2024 19:01:41 +0100 Subject: [PATCH 001/157] update measured with some more convenient features (#7334) ## Problem Some awkwardness in the measured API. Missing process metrics. ## Summary of changes Update measured to use the new convenience setup features. Added measured-process lib. Added measured support for libmetrics --- Cargo.lock | 175 +++++++++++++++++++++----- Cargo.toml | 3 +- libs/metrics/Cargo.toml | 2 + libs/metrics/src/lib.rs | 146 ++++++++++++++++++++- storage_controller/src/http.rs | 18 ++- storage_controller/src/main.rs | 8 +- storage_controller/src/metrics.rs | 120 +++++------------- storage_controller/src/persistence.rs | 6 +- workspace_hack/Cargo.toml | 5 +- 9 files changed, 345 insertions(+), 138 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66ff3dedb7..a7e29b1de3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1124,7 +1124,7 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -1462,12 +1462,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1840,23 +1837,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2294,6 +2280,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2794,6 +2786,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.10" @@ -2848,11 +2846,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.13" +version = "0.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" 
+checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452" dependencies = [ "bytes", + "crossbeam-utils", "hashbrown 0.14.0", "itoa", "lasso", @@ -2865,16 +2864,27 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.13" +version = "0.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "measured-process" +version = "0.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2914,8 +2924,10 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", - "procfs", + "procfs 0.14.2", "prometheus", "rand 0.8.5", "rand_distr", @@ -3525,7 +3537,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", @@ -4085,6 +4097,29 @@ dependencies = [ "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -4097,7 +4132,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -4118,7 +4153,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -4810,6 +4845,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + [[package]] name = "rustls" version = "0.21.9" @@ -5670,7 +5718,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -6930,6 +6978,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6960,6 +7017,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6972,6 +7044,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6984,6 +7062,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6996,6 +7080,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -7008,6 +7098,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -7020,6 +7116,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -7032,6 +7134,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -7044,6 +7152,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -7092,7 +7206,6 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", diff --git a/Cargo.toml b/Cargo.toml index 3c6077648e..5db6b7016a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,7 +106,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.13", features=["default", "lasso"] } +measured = { version = "0.0.20", features=["lasso"] } +measured-process = { version = "0.0.20" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f6a49a0166..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -10,11 +10,13 @@ libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true +measured-process.workspace = true [dev-dependencies] rand = "0.8" diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 22b0a18933..6cff28c0ca 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. #![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,7 +35,6 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; @@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. 
-pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + +#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: 
&str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. @@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); @@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -188,7 +321,10 @@ impl GenericCounterPairVec

{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -201,7 +337,7 @@ impl GenericCounterPairVec

{ self.get_metric_with_label_values(vals).unwrap() } - pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c59bcaa174..2e83bbc5ed 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -8,6 +8,7 @@ use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -44,15 +45,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use routerify::Middleware; /// State available to HTTP request handlers -#[derive(Clone)] pub struct HttpState { service: Arc, auth: Option>, + neon_metrics: NeonMetrics, allowlist_routes: Vec, } impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) @@ -60,6 +65,7 @@ impl HttpState { Self { service, auth, + neon_metrics: NeonMetrics::new(build_info), allowlist_routes, } } @@ -672,10 +678,11 @@ fn epilogue_metrics_middleware }) } -pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; - let payload = crate::metrics::METRICS_REGISTRY.encode(); + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() .status(200) .header(CONTENT_TYPE, TEXT_FORMAT) @@ -704,6 +711,7 @@ where pub fn make_router( service: Arc, auth: Option>, + build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() .middleware(prologue_metrics_middleware()) @@ -720,7 +728,7 @@ pub fn make_router( } router - .data(Arc::new(HttpState::new(service, auth))) + .data(Arc::new(HttpState::new(service, auth, build_info))) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 3c03d6efe8..6466b9f7a3 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -3,6 +3,7 @@ use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; use std::sync::Arc; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; @@ -192,6 +193,11 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + let strict_mode = if args.dev { StrictMode::Dev } else { @@ -253,7 +259,7 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service.clone(), auth) + let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; let router_service = 
utils::http::RouterService::new(router).unwrap(); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index cabf416b9f..ac9f22c739 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -8,10 +8,8 @@ //! The rest of the code defines label group types and deals with converting outer types to labels. //! use bytes::Bytes; -use measured::{ - label::{LabelValue, StaticLabelSet}, - FixedCardinalityLabel, MetricGroup, -}; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; @@ -26,13 +24,15 @@ pub fn preinitialize_metrics() { pub(crate) struct StorageControllerMetrics { pub(crate) metrics_group: StorageControllerMetricGroup, - encoder: Mutex, + encoder: Mutex, } #[derive(measured::MetricGroup)] +#[metric(new())] pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, @@ -43,7 +43,9 @@ pub(crate) struct StorageControllerMetricGroup { /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_http_request_latency: measured::HistogramVec, @@ -55,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, @@ -66,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_passthrough_request_latency: measured::HistogramVec, @@ -74,76 +78,34 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Latency of database queries, broken down by operation. 
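    /// (Same exponential thresholds, 0.1s and doubling, as the HTTP/pageserver histograms above;
    /// the `#[metric(metadata = ...)]` attributes replace the hand-written `HistogramVec::new(...)`
    /// calls removed from the old `StorageControllerMetricGroup::new` below.)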
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, } impl StorageControllerMetrics { - pub(crate) fn encode(&self) -> Bytes { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { let mut encoder = self.encoder.lock().unwrap(); - self.metrics_group.collect_into(&mut *encoder); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); encoder.finish() } } impl Default for StorageControllerMetrics { fn default() -> Self { - Self { - metrics_group: StorageControllerMetricGroup::new(), - encoder: Mutex::new(measured::text::TextEncoder::new()), - } - } -} + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); -impl StorageControllerMetricGroup { - pub(crate) fn new() -> Self { Self { - storage_controller_reconcile_spawn: measured::Counter::new(), - storage_controller_reconcile_complete: measured::CounterVec::new( - ReconcileCompleteLabelGroupSet { - status: StaticLabelSet::new(), - }, - ), - storage_controller_schedule_optimization: measured::Counter::new(), - storage_controller_http_request_status: measured::CounterVec::new( - HttpRequestStatusLabelGroupSet { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_pageserver_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_pageserver_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_passthrough_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_passthrough_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_database_query_error: measured::CounterVec::new( - DatabaseQueryErrorLabelGroupSet { - operation: StaticLabelSet::new(), - error_type: StaticLabelSet::new(), - }, - ), - storage_controller_database_query_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), } } } @@ -157,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup { #[derive(measured::LabelGroup)] #[label(set = HttpRequestStatusLabelGroupSet)] pub(crate) struct HttpRequestStatusLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, pub(crate) status: StatusCode, @@ -166,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> { #[derive(measured::LabelGroup)] #[label(set = HttpRequestLatencyLabelGroupSet)] pub(crate) struct HttpRequestLatencyLabelGroup<'a> { - #[label(dynamic_with = 
lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for HttpRequestLatencyLabelGroupSet { - fn default() -> Self { - Self { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup, Clone)] #[label(set = PageserverRequestLabelGroupSet)] pub(crate) struct PageserverRequestLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) pageserver_id: &'a str, - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for PageserverRequestLabelGroupSet { - fn default() -> Self { - Self { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { @@ -213,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] Success, @@ -221,7 +164,7 @@ pub(crate) enum ReconcileOutcome { Cancel, } -#[derive(FixedCardinalityLabel, Clone)] +#[derive(FixedCardinalityLabel, Copy, Clone)] pub(crate) enum Method { Get, Put, @@ -246,11 +189,12 @@ impl From for Method { } } +#[derive(Clone, Copy)] pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); impl LabelValue for StatusCode { fn visit(&self, v: V) -> V::Output { - v.write_int(self.0.as_u16() as u64) + v.write_int(self.0.as_u16() as i64) } } @@ -268,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode { } } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum DatabaseErrorLabel { Query, Connection, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 55fbfd10bc..5312e1e218 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -79,7 +79,7 @@ pub(crate) enum DatabaseError { Logical(String), } -#[derive(measured::FixedCardinalityLabel, Clone)] +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] pub(crate) enum DatabaseOperation { InsertNode, UpdateNode, @@ -153,9 +153,7 @@ impl Persistence { let latency = &METRICS_REGISTRY .metrics_group .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { - operation: op.clone(), - }); + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let res = self.with_conn(func).await; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7b8228a082..bcbd4daa7e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -37,8 +37,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } -hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = 
"0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -91,7 +90,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } From bcab344490fbb68daf75c98900cdd8e20f6417d6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 9 Apr 2024 10:50:43 +0100 Subject: [PATCH 002/157] CI(flaky-tests): remove outdated restriction (#7345) ## Problem After switching the default pageserver io-engine to `tokio-epoll-uring` on CI, we tuned a query that finds flaky tests (in https://github.com/neondatabase/neon/pull/7077). It has been almost a month since then, additional query tuning is not required anymore. ## Summary of changes - Remove extra condition from flaky tests query - Also return back parameterisation to the query --- scripts/flaky_tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 853c67d218..878840fcee 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + started_at > CURRENT_DATE - INTERVAL '%s' day AND ( (status IN ('failed', 'broken') AND reference = 'refs/heads/main') OR flaky From 4f4f787119c2a353da0a0691714256bec1f82b11 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 9 Apr 2024 12:03:46 +0100 Subject: [PATCH 003/157] Update staging hostname (#7347) ## Problem ``` Could not resolve host: console.stage.neon.tech ``` ## Summary of changes - replace `console.stage.neon.tech` with `console-stage.neon.build` --- .github/actions/neon-branch-create/action.yml | 2 +- .github/actions/neon-branch-delete/action.yml | 2 +- .github/actions/neon-project-create/action.yml | 2 +- .github/actions/neon-project-delete/action.yml | 2 +- scripts/sk_cleanup_tenants/script.py | 2 +- scripts/sk_collect_dumps/readme.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..dea3fc2357 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..8acba7ad00 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -13,7 +13,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: 
console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..7f0e599b97 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -13,7 +13,7 @@ inputs: default: 15 api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build provisioner: desctiption: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..b8ec6cac70 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check From dbac2d2c473f3648251f0a64e36d066f444dfe00 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 02:40:14 +0200 Subject: [PATCH 004/157] Proxy read ids from redis (#7205) ## Problem Proxy doesn't know about existing endpoints. ## Summary of changes * Added caching of all available endpoints. * On the high load, use it before going to cplane. * Report metrics for the outcome. 
* For rate limiter and credentials caching don't distinguish between `-pooled` and not TODOs: * Make metrics more meaningful * Consider integrating it with the endpoint rate limiter * Test it together with cplane in preview --- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 15 +- proxy/src/cache.rs | 1 + proxy/src/cache/endpoints.rs | 191 ++++++++++++++++++ proxy/src/config.rs | 69 +++++++ proxy/src/console/provider.rs | 22 +- proxy/src/console/provider/neon.rs | 20 +- proxy/src/context.rs | 15 +- proxy/src/intern.rs | 15 ++ proxy/src/lib.rs | 37 ++++ proxy/src/metrics.rs | 12 ++ proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 -------- 16 files changed, 393 insertions(+), 114 deletions(-) create mode 100644 proxy/src/cache/endpoints.rs delete mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..71e9da18bc 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..9302b31d5c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -189,7 +189,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -401,6 +403,7 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { + maintenance_tasks.spawn(api.locks.garbage_collect_worker()); if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( @@ -410,6 +413,9 @@ async fn main() -> anyhow::Result<()> { args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + let cache = api.caches.endpoints_cache.clone(); + let con = redis_notifications_client.clone(); + maintenance_tasks.spawn(async move { cache.do_read(con).await }); } } } @@ -489,14 +495,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with 
options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -507,10 +517,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) + console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout, epoch) .unwrap(), )); - tokio::spawn(locks.garbage_collect_worker(epoch)); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..9bc019c2d8 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,191 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::REDIS_BROKEN_MESSAGES, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, Normalize, +}; + +#[derive(Deserialize, Debug, Clone)] +#[serde(rename_all(deserialize = "snake_case"))] +pub enum ControlPlaneEventKey { + EndpointCreated, + BranchCreated, + ProjectCreated, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + ctx.set_rejected(self.should_reject(endpoint)); + return true; + } + // If the limiter allows, we don't need to check the cache. 
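+ // (I.e. under normal load every endpoint goes straight to the control plane; only once
+ // the global rate limiter runs out of budget do we consult the cached
+ // endpoint/branch/project sets and reject endpoints we have never seen announced.)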
+ if self.limiter.lock().await.check() { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + let endpoint = endpoint.normalize(); + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(&endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey, value: String) { + // Do not do normalization here, we expect the events to be normalized. + match key { + ControlPlaneEventKey::EndpointCreated => { + self.endpoints.insert(EndpointIdInt::from(&value.into())); + } + ControlPlaneEventKey::BranchCreated => { + self.branches.insert(BranchIdInt::from(&value.into())); + } + ControlPlaneEventKey::ProjectCreated => { + self.projects.insert(ProjectIdInt::from(&value.into())); + } + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + self.ready.store(false, Ordering::Release); + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + continue; + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + } + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.initial_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(key: &str, value: &Value) -> anyhow::Result<(ControlPlaneEventKey, String)> { + Ok((serde_json::from_str(key)?, String::from_redis_value(value)?)) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + + if return_when_finish && res.ids.len() <= self.config.default_batch_size { + break; + } + for x in res.ids { + total += 1; + for (k, v) in x.map { + let (key, value) = match Self::parse_key_value(&k, &v) { + Ok(x) => x, + Err(e) => { + REDIS_BROKEN_MESSAGES + .with_label_values(&[&self.config.stream_name]) + .inc(); + tracing::error!("error parsing key-value {k}-{v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key, value); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..3bdfb3cfad 100644 --- 
a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,6 +313,75 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..ee2bc866ab 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,15 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; +use std::{convert::Infallible, sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use 
tracing::info; @@ -416,12 +416,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -431,6 +434,7 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -441,6 +445,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, + epoch: std::time::Duration, registered: prometheus::IntCounter, unregistered: prometheus::IntCounter, reclamation_lag: prometheus::Histogram, @@ -453,6 +458,7 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, ) -> prometheus::Result { let registered = prometheus::IntCounter::with_opts( prometheus::Opts::new( @@ -497,6 +503,7 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, + epoch, lock_acquire_lag, registered, unregistered, @@ -536,12 +543,9 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { - if self.permits == 0 { - return; - } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + pub async fn garbage_collect_worker(&self) -> anyhow::Result { + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..3a0e5609d8 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,6 +8,7 @@ use super::{ }; use crate::{ auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, + Normalize, }; use crate::{ cache::Cached, @@ -23,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, jwt: String, } @@ -55,6 +56,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -81,7 +91,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
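// (This mirrors the endpoints-cache early return added above: a missing endpoint
// likewise yields `Ok(AuthInfo::default())` rather than an error.)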
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -181,7 +193,7 @@ impl super::Api for Api { } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let ep_int = ep.normalize().into(); self.caches.project_info.insert_role_secret( project_id, ep_int, @@ -218,7 +230,7 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let ep_int = ep.normalize().into(); self.caches.project_info.insert_role_secret( project_id, ep_int, diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..85544f1d65 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,9 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{ + bool_to_str, LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND, NUM_INVALID_ENDPOINTS, + }, DbName, EndpointId, RoleName, }; @@ -50,6 +52,8 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. + rejected: bool, } #[derive(Clone, Debug)] @@ -93,6 +97,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -113,6 +118,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = rejected; + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -178,6 +187,10 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { + let outcome = if self.success { "success" } else { "failure" }; + NUM_INVALID_ENDPOINTS + .with_label_values(&[self.protocol, bool_to_str(self.rejected), outcome]) + .inc(); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod 
tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..3f6d985fe8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..f299313e0a 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -169,6 +169,18 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); +pub static NUM_INVALID_ENDPOINTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_invalid_endpoints_total", + "Number of invalid endpoints (per protocol, per rejected).", + // http/ws/tcp, true/false, success/failure + // TODO(anna): the last dimension is just a proxy to what we actually want to measure. + // We need to measure whether the endpoint was found by cplane or not. 
+ &["protocol", "rejected", "outcome"], + ) + .unwrap() +}); + pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..166e761a4e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, + EndpointCacheKey, Normalize, }; use futures::TryFutureExt; use itertools::Itertools; @@ -280,7 +280,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { + if !endpoint_rate_limiter.check(ep.normalize(), 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..a3b83e5e50 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..0503deb311 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -24,13 +24,13 @@ use super::{ RateLimiterConfig, }; -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +50,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor 
import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] From 221414de4b0260056e0961528d46c5141825a0a0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 10 Apr 2024 06:31:28 +0100 Subject: [PATCH 005/157] pageserver: time based rolling based on the first write timestamp (#7346) Problem Currently, we base our time based layer rolling decision on the last time we froze a layer. This means that if we roll a layer and then go idle for longer than the checkpoint timeout the next layer will be rolled after the first write. This is of course not desirable. Summary of changes Record the timepoint of the first write to an open layer and use that for time based layer rolling decisions. Note that I had to keep `Timeline::last_freeze_ts` for the sharded tenant disk consistent lsn skip hack. 
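As an illustration of the new behaviour (an editorial sketch, not part of this patch), here is a minimal self-contained version of the rolling decision; the parameter names are assumed to mirror the method signature changed in the diff below:

```
use std::time::{Duration, Instant};

// Standalone sketch (not the actual pageserver code): the time-based branch now
// keys off the first write into the currently open in-memory layer (`opened_at`)
// instead of the last freeze timestamp, so a long idle period by itself no longer
// forces the next layer to roll right after the first write lands.
fn should_roll_layer(
    layer_size: u64,          // bytes buffered in the open layer
    wal_distance: i128,       // bytes of WAL since the last freeze LSN
    checkpoint_distance: u64,
    checkpoint_timeout: Duration,
    opened_at: Instant,       // timestamp of the first write to the open layer
) -> bool {
    if wal_distance >= checkpoint_distance as i128 || layer_size >= checkpoint_distance {
        // Size-based rolling is unchanged by this patch.
        true
    } else {
        // Time-based rolling: only roll if the layer actually has data and the
        // first write into it is older than the checkpoint timeout.
        wal_distance > 0 && opened_at.elapsed() >= checkpoint_timeout
    }
}

fn main() {
    // Right after the first write, neither the size nor the time threshold is hit.
    let opened_at = Instant::now();
    assert!(!should_roll_layer(
        1024,
        1024,
        1 << 30,
        Duration::from_secs(60),
        opened_at
    ));
}
```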
Fixes #7241 --- .../tenant/storage_layer/inmemory_layer.rs | 8 +++ pageserver/src/tenant/timeline.rs | 29 ++++------- .../regress/test_pageserver_layer_rolling.py | 50 ++++++++++++++++--- 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 43942ba2db..29751641b4 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -19,6 +19,7 @@ use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) @@ -53,6 +54,8 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + opened_at: Instant, + /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -460,6 +463,7 @@ impl InMemoryLayer { tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { index: HashMap::new(), file, @@ -520,6 +524,10 @@ impl InMemoryLayer { Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + pub(crate) async fn tick(&self) -> Option { let mut inner = self.inner.write().await; let size = inner.file.len(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d3c8c5f66c..d046a60af4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1257,7 +1257,7 @@ impl Timeline { checkpoint_distance, self.get_last_record_lsn(), self.last_freeze_at.load(), - *self.last_freeze_ts.read().unwrap(), + open_layer.get_opened_at(), ) { match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { @@ -1622,7 +1622,7 @@ impl Timeline { checkpoint_distance: u64, projected_lsn: Lsn, last_freeze_at: Lsn, - last_freeze_ts: Instant, + opened_at: Instant, ) -> bool { let distance = projected_lsn.widening_sub(last_freeze_at); @@ -1648,13 +1648,13 @@ impl Timeline { ); true - } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - projected_lsn, - layer_size, - last_freeze_ts.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -4703,23 +4703,16 @@ struct TimelineWriterState { max_lsn: Option, // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. 
cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, } impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { Self { open_layer, current_size, prev_lsn: None, max_lsn: None, cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, } } } @@ -4818,12 +4811,10 @@ impl<'a> TimelineWriter<'a> { let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); self.write_guard.replace(TimelineWriterState::new( layer, initial_size, last_freeze_at, - last_freeze_ts, )); Ok(()) @@ -4870,7 +4861,7 @@ impl<'a> TimelineWriter<'a> { self.get_checkpoint_distance(), lsn, state.cached_last_freeze_at, - state.cached_last_freeze_ts, + state.open_layer.get_opened_at(), ) { OpenLayerAction::Roll } else { diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c7e1e88468..c5dc0f2919 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,6 +1,7 @@ import asyncio import os -from typing import Tuple +import time +from typing import Optional, Tuple import psutil import pytest @@ -20,20 +21,30 @@ ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 -async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + with env.endpoints.create_start("main", tenant_id=tenant) as ep: conn = await ep.connect_async() try: await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" ) finally: await conn.close(timeout=10) last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn async def workload( @@ -89,7 +100,9 @@ def assert_dirty_bytes(env, v): def assert_dirty_bytes_nonzero(env): - assert get_dirty_bytes(env) > 0 + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes @pytest.mark.parametrize("immediate_shutdown", [True, False]) @@ -182,6 +195,31 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): log.info("Waiting for background checkpoints...") wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. 
+ + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + @pytest.mark.skipif( # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is From fd88d4608c3e8a8cb8579786a7b507a436033efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 10 Apr 2024 09:12:07 +0200 Subject: [PATCH 006/157] Add command to time travel recover prefixes (#7322) Adds another tool to the DR toolbox: ability in pagectl to recover arbitrary prefixes in remote storage. Requires remote storage config, the prefix, and the travel-to timestamp parameter to be specified as cli args. The done-if-after parameter is also supported. Example invocation (after `aws login --profile dev`): ``` RUST_LOG=remote_storage=debug AWS_PROFILE=dev cargo run -p pagectl time-travel-remote-prefix 'remote_storage = { bucket_name = "neon-test-bucket-name", bucket_region = "us-east-2" }' wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/ 2024-04-05T17:00:00Z ``` This has been written to resolve a customer recovery case: https://neondb.slack.com/archives/C033RQ5SPDH/p1712256888468009 There is validation of the prefix to prevent accidentially specifying too generic prefixes, which can cause corruption and data loss if used wrongly. Still, the validation is not perfect and it is important that the command is used with caution. If possible, `time_travel_remote_storage` should be used instead which has additional checks in place. --- Cargo.lock | 5 ++ pageserver/ctl/Cargo.toml | 5 ++ pageserver/ctl/src/main.rs | 166 ++++++++++++++++++++++++++++++++++++- 3 files changed, 175 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index a7e29b1de3..4c2bcf250e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3477,12 +3477,17 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..843f5dd862 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,14 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." 
} +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e73d961e36..1fb75584fc 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,6 +9,11 @@ mod index_part; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +25,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,6 +56,7 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] @@ -68,6 +82,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. Validation is only very light, +/// so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back and after. + /// You can use a few seconds before invoking the command. Same format as `travel_to`. + done_if_after: Option, +} + #[derive(Parser)] struct AnalyzeLayerMapCmd { /// Pageserver data path @@ -78,6 +112,14 @@ struct AnalyzeLayerMapCmd { #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? 
+ } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } }; Ok(()) } @@ -185,3 +263,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophy and away +/// by only one character change (similar to "rm -r /home /username/foobar"). +fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + 
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} From c0ff4f18dcb60d2b8035a8d83b693e5e81ceaeff Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 10 Apr 2024 09:23:59 +0100 Subject: [PATCH 007/157] proxy: hyper1 for only proxy (#7073) ## Problem hyper1 offers control over the HTTP connection that hyper0_14 does not. We're blocked on switching all services to hyper1 because of how we use tonic, but no reason we can't switch proxy over. ## Summary of changes 1. hyper0.14 -> hyper1 1. self managed server 2. Remove the `WithConnectionGuard` wrapper from `protocol2` 2. Remove TLS listener as it's no longer necessary 3. include first session ID in connection startup logs --- Cargo.lock | 214 +++++++++++++---- Cargo.toml | 3 +- proxy/Cargo.toml | 4 + proxy/src/protocol2.rs | 105 +-------- proxy/src/serverless.rs | 315 ++++++++++++++------------ proxy/src/serverless/http_util.rs | 92 ++++++++ proxy/src/serverless/sql_over_http.rs | 44 ++-- proxy/src/serverless/tls_listener.rs | 123 ---------- workspace_hack/Cargo.toml | 3 +- 9 files changed, 458 insertions(+), 445 deletions(-) create mode 100644 proxy/src/serverless/http_util.rs delete mode 100644 proxy/src/serverless/tls_listener.rs diff --git a/Cargo.lock b/Cargo.lock index 4c2bcf250e..bdf2b08c5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,12 @@ dependencies = [ "critical-section", ] +[[package]] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" + [[package]] name = "autocfg" version = "1.1.0" @@ -298,7 +304,7 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", @@ -335,7 +341,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "http 0.2.9", - "http-body", + "http-body 0.4.5", "percent-encoding", "pin-project-lite", "tracing", @@ -386,7 +392,7 @@ dependencies = [ "aws-types", "bytes", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "regex-lite", @@ -514,7 +520,7 @@ dependencies = [ "crc32fast", "hex", "http 0.2.9", - "http-body", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -546,7 +552,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -585,10 +591,10 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "once_cell", "pin-project-lite", @@ -626,7 +632,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "itoa", "num-integer", "pin-project-lite", @@ -675,8 +681,8 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", "matchit", "memchr", @@ -691,7 +697,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.20.0", 
"tower", "tower-layer", "tower-service", @@ -707,7 +713,7 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -1196,7 +1202,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1313,7 +1319,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -2199,6 +2205,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2370,6 +2395,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", +] + [[package]] name = "http-types" version = "2.12.0" @@ -2428,9 +2476,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2442,6 +2490,26 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + [[package]] name = "hyper-rustls" version = "0.24.0" @@ -2449,7 +2517,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "log", "rustls 0.21.9", "rustls-native-certs 0.6.2", @@ -2463,7 +2531,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2476,7 +2544,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.26", "native-tls", "tokio", "tokio-native-tls", @@ -2484,15 +2552,33 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" +checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" dependencies = [ - "hyper", + "http-body-util", + "hyper 1.2.0", 
+ "hyper-util", "pin-project-lite", "tokio", - "tokio-tungstenite", - "tungstenite", + "tokio-tungstenite 0.21.0", + "tungstenite 0.21.0", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", + "pin-project-lite", + "socket2 0.5.5", + "tokio", ] [[package]] @@ -3523,7 +3609,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", "leaky-bucket", "md5", @@ -4202,6 +4288,7 @@ dependencies = [ "anyhow", "async-compression", "async-trait", + "atomic-take", "aws-config", "aws-sdk-iam", "aws-sigv4", @@ -4225,9 +4312,12 @@ dependencies = [ "hmac", "hostname", "http 1.1.0", + "http-body-util", "humantime", - "hyper", + "hyper 0.14.26", + "hyper 1.2.0", "hyper-tungstenite", + "hyper-util", "ipnet", "itertools", "lasso", @@ -4560,7 +4650,7 @@ dependencies = [ "futures-util", "http-types", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4590,10 +4680,10 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-rustls", "hyper-tls", "ipnet", @@ -4651,7 +4741,7 @@ dependencies = [ "futures", "getrandom 0.2.11", "http 0.2.9", - "hyper", + "hyper 0.14.26", "parking_lot 0.11.2", "reqwest", "reqwest-middleware", @@ -4738,7 +4828,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -5043,7 +5133,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5528,9 +5618,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5622,7 +5712,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5653,7 +5743,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "lasso", "measured", @@ -5682,7 +5772,7 @@ dependencies = [ "anyhow", "clap", "comfy-table", - "hyper", + "hyper 0.14.26", "pageserver_api", "pageserver_client", "reqwest", @@ -6165,7 +6255,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.20.1", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.21.0", ] [[package]] @@ -6232,10 +6334,10 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", @@ -6421,7 +6523,7 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = 
[ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", @@ -6458,6 +6560,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -6623,7 +6744,7 @@ dependencies = [ "hex", "hex-literal", "humantime", - "hyper", + "hyper 0.14.26", "jsonwebtoken", "leaky-bucket", "metrics", @@ -7214,7 +7335,7 @@ dependencies = [ "hashbrown 0.14.0", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", "itertools", "libc", @@ -7252,7 +7373,6 @@ dependencies = [ "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 5db6b7016a..feea17ab05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } +atomic-take = "1.1.0" azure_core = "0.18" azure_identity = "0.18" azure_storage = "0.18" @@ -97,7 +98,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +hyper-tungstenite = "0.13.0" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b327890be2..12bd67ea36 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -12,6 +12,7 @@ testing = [] anyhow.workspace = true async-compression.workspace = true async-trait.workspace = true +atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true @@ -36,6 +37,9 @@ http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 700c8c8681..70f9b4bfab 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -5,19 +5,13 @@ use std::{ io, net::SocketAddr, pin::{pin, Pin}, - sync::Mutex, task::{ready, Context, Poll}, }; use bytes::{Buf, BytesMut}; -use hyper::server::accept::Accept; -use hyper::server::conn::{AddrIncoming, AddrStream}; -use metrics::IntCounterPairGuard; +use hyper::server::conn::AddrIncoming; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -use uuid::Uuid; - -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; pub struct ProxyProtocolAccept { pub incoming: AddrIncoming, @@ -331,103 +325,6 @@ impl AsyncRead for WithClientIp { } } -impl Accept for ProxyProtocolAccept { - type Conn = WithConnectionGuard>; - - type Error = io::Error; - - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - - let conn_id = uuid::Uuid::new_v4(); - let span = tracing::info_span!("http_conn", ?conn_id); - { - let _enter = span.enter(); 
- tracing::info!("accepted new TCP connection"); - } - - let Some(conn) = conn else { - return Poll::Ready(None); - }; - - Poll::Ready(Some(Ok(WithConnectionGuard { - inner: WithClientIp::new(conn), - connection_id: Uuid::new_v4(), - gauge: Mutex::new(Some( - NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[self.protocol]) - .guard(), - )), - span, - }))) - } -} - -pin_project! { - pub struct WithConnectionGuard { - #[pin] - pub inner: T, - pub connection_id: Uuid, - pub gauge: Mutex>, - pub span: tracing::Span, - } - - impl PinnedDrop for WithConnectionGuard { - fn drop(this: Pin<&mut Self>) { - let _enter = this.span.enter(); - tracing::info!("HTTP connection closed") - } - } -} - -impl AsyncWrite for WithConnectionGuard { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - -impl AsyncRead for WithConnectionGuard { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - self.project().inner.poll_read(cx, buf) - } -} - #[cfg(test)] mod tests { use std::pin::pin; diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a2010fd613..f275caa7eb 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -4,42 +4,48 @@ mod backend; mod conn_pool; +mod http_util; mod json; mod sql_over_http; -pub mod tls_listener; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use tracing::instrument::Instrumented; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; +use crate::metrics::{NUM_CLIENT_CONNECTION_GAUGE, TLS_HANDSHAKE_FAILURES}; +use crate::protocol2::WithClientIp; +use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use hyper::{ - server::conn::{AddrIncoming, AddrStream}, - Body, Method, Request, Response, -}; +use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::Arc; -use std::task::Poll; -use tls_listener::TlsListener; -use tokio::net::TcpListener; -use 
tokio_util::sync::{CancellationToken, DropGuard}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use utils::http::error::ApiError; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -91,161 +97,174 @@ pub async fn task_main( tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - protocol: "http", - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` + let server = Builder::new(hyper_util::rt::TokioExecutor::new()); - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream< - WithConnectionGuard>, - >| { - let (conn, _) = stream.get_ref(); + connections.spawn( + connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + cancellation_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span), + ); + } - // this is jank. should dissapear with hyper 1.0 migration. - let gauge = conn - .gauge - .lock() - .expect("lock should not be poisoned") - .take() - .expect("gauge should be set on connection start"); - - // Cancel all current inflight HTTP requests if the HTTP connection is closed. - let http_cancellation_token = CancellationToken::new(); - let cancel_connection = http_cancellation_token.clone().drop_guard(); - - let span = conn.span.clone(); - let client_addr = conn.inner.client_addr(); - let remote_addr = conn.inner.inner.remote_addr(); - let backend = backend.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new( - hyper::service::service_fn(move |req: Request| { - let backend = backend.clone(); - let ws_connections2 = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - let http_cancellation_token = http_cancellation_token.child_token(); - - // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. - // By spawning the future, we ensure it never gets cancelled until it decides to. 
- ws_connections.spawn( - async move { - // Cancel the current inflight HTTP request if the requets stream is closed. - // This is slightly different to `_cancel_connection` in that - // h2 can cancel individual requests with a `RST_STREAM`. - let _cancel_session = http_cancellation_token.clone().drop_guard(); - - let res = request_handler( - req, - config, - backend, - ws_connections2, - cancellation_handler, - peer_addr.ip(), - endpoint_rate_limiter, - http_cancellation_token, - ) - .await - .map_or_else(|e| e.into_response(), |r| r); - - _cancel_session.disarm(); - - res - } - .in_current_span(), - ) - }), - gauge, - cancel_connection, - span, - )) - } - }, - ); - - hyper::Server::builder(tls_listener) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; - - // await websocket connections - ws_connections.wait().await; + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, -} +/// Handles the TCP lifecycle. +/// +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +/// 3. Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. +#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, + cancellation_token: CancellationToken, + server: Builder, + tls_acceptor: TlsAcceptor, + conn: TcpStream, + peer_addr: SocketAddr, +) { + let session_id = uuid::Uuid::new_v4(); -impl MetricService { - fn new( - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, - ) -> MetricService { - MetricService { - inner, - _gauge, - _cancel, - span, + let _gauge = NUM_CLIENT_CONNECTION_GAUGE + .with_label_values(&["http"]) + .guard(); + + // handle PROXY protocol + let mut conn = WithClientIp::new(conn); + let peer = match conn.wait_for_addr().await { + Ok(peer) => peer, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return; } - } -} + }; -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; + let peer_addr = peer.unwrap_or(peer_addr).ip(); + info!(?session_id, %peer_addr, "accepted new TCP connection"); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // try upgrade to TLS, but with a timeout. + let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + // The handshake timed out + Err(e) => { + TLS_HANDSHAKE_FAILURES.inc(); + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + }; - fn call(&mut self, req: Request) -> Self::Future { - self.span - .in_scope(|| self.inner.call(req)) - .instrument(self.span.clone()) + let session_id = AtomicTake::new(session_id); + + // Cancel all current inflight HTTP requests if the HTTP connection is closed. 
+ let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); + + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. + let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + endpoint_rate_limiter.clone(), + http_request_token, + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, cancellation_handler: Arc, + session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, -) -> Result, ApiError> { - let session_id = uuid::Uuid::new_v4(); - +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -282,14 +301,14 @@ async fn request_handler( // Return the response so the spawned future can continue. 
Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -299,7 +318,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..ab9127b13e --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,92 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. + +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg }.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> 
Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 00dffd5784..7f7f93988c 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,18 +1,22 @@ use std::pin::pin; use std::sync::Arc; +use bytes::Bytes; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; use serde_json::json; use serde_json::Value; use tokio::time; @@ -29,7 +33,6 @@ use tracing::error; use tracing::info; use url::Url; use utils::http::error::ApiError; -use utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; @@ -52,6 +55,7 @@ use crate::RoleName; use super::backend::PoolingBackend; use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; use super::json::JsonConversionError; @@ -218,10 +222,10 @@ fn get_conn_info( pub async fn handle( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - request: Request, + request: Request, backend: Arc, cancel: CancellationToken, -) -> Result, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &mut ctx, request, backend).await; let mut response = match result { @@ -332,10 +336,9 @@ pub async fn handle( } }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -396,7 +399,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), + Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -437,7 +440,7 @@ struct HttpHeaders { } impl HttpHeaders { - fn try_parse(headers: &hyper::http::HeaderMap) -> Result { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. 
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); @@ -488,9 +491,9 @@ async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, + request: Request, backend: Arc, -) -> Result, SqlOverHttpError> { +) -> Result>, SqlOverHttpError> { let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE .with_label_values(&[ctx.protocol]) .guard(); @@ -528,7 +531,7 @@ async fn handle_inner( } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()).await?; + let body = request.into_body().collect().await?.to_bytes(); info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly @@ -596,7 +599,7 @@ async fn handle_inner( let body = serde_json::to_string(&result).expect("json serialization should not fail"); let len = body.len(); let response = response - .body(Body::from(body)) + .body(Full::new(Bytes::from(body))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -639,6 +642,7 @@ impl QueryData { } // The query was cancelled. Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::error!(?err, "could not cancel query"); } diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs deleted file mode 100644 index 33f194dd59..0000000000 --- a/proxy/src/serverless/tls_listener.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::{ - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use hyper::server::{accept::Accept, conn::AddrStream}; -use pin_project_lite::pin_project; -use tokio::{ - io::{AsyncRead, AsyncWrite}, - task::JoinSet, - time::timeout, -}; -use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn, Instrument}; - -use crate::{ - metrics::TLS_HANDSHAKE_FAILURES, - protocol2::{WithClientIp, WithConnectionGuard}, -}; - -pin_project! { - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - pub(crate) struct TlsListener { - #[pin] - listener: A, - tls: TlsAcceptor, - waiting: JoinSet>>, - timeout: Duration, - } -} - -impl TlsListener { - /// Create a `TlsListener` with default options. 
- pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { - TlsListener { - listener, - tls, - waiting: JoinSet::new(), - timeout, - } - } -} - -impl Accept for TlsListener -where - A: Accept>>, - A::Error: std::error::Error, - A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, -{ - type Conn = TlsStream; - - type Error = Infallible; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let mut this = self.project(); - - loop { - match this.listener.as_mut().poll_accept(cx) { - Poll::Pending => break, - Poll::Ready(Some(Ok(mut conn))) => { - let t = *this.timeout; - let tls = this.tls.clone(); - let span = conn.span.clone(); - this.waiting.spawn(async move { - let peer_addr = match conn.inner.wait_for_addr().await { - Ok(Some(addr)) => addr, - Err(e) => { - tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; - } - Ok(None) => conn.inner.inner.remote_addr() - }; - - let accept = tls.accept(conn); - match timeout(t, accept).await { - Ok(Ok(conn)) => { - info!(%peer_addr, "accepted new TLS connection"); - Some(conn) - }, - // The handshake failed, try getting another connection from the queue - Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); - None - } - // The handshake timed out, try getting another connection from the queue - Err(_) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: timeout"); - None - } - } - }.instrument(span)); - } - Poll::Ready(Some(Err(e))) => { - tracing::error!("error accepting TCP connection: {e}"); - continue; - } - Poll::Ready(None) => return Poll::Ready(None), - } - } - - loop { - return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), - // The handshake failed to complete, try getting another connection from the queue - Poll::Ready(Some(Ok(None))) => continue, - // The handshake panicked or was cancelled. 
ignore and get another connection - Poll::Ready(Some(Err(e))) => { - tracing::warn!("handshake aborted: {e}"); - continue; - } - _ => Poll::Pending, - }; - } - } -} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index bcbd4daa7e..d6e2cc2996 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -63,7 +63,7 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } @@ -75,7 +75,6 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive"] } From 5efe95a008bb6a19ec9676a0c7b1a5516f85e4c1 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:30:09 +0200 Subject: [PATCH 008/157] proxy: fix credentials cache lookup (#7349) ## Problem Incorrect processing of `-pooler` connections. ## Summary of changes Fix TODO: add e2e tests for caching --- proxy/src/cache/endpoints.rs | 5 ++--- proxy/src/console/provider/neon.rs | 32 ++++++++++++++++++------------ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 9bc019c2d8..31e3ef6891 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -21,7 +21,7 @@ use crate::{ metrics::REDIS_BROKEN_MESSAGES, rate_limiter::GlobalRateLimiter, redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, Normalize, + EndpointId, }; #[derive(Deserialize, Debug, Clone)] @@ -72,9 +72,8 @@ impl EndpointsCache { !rejected } fn should_reject(&self, endpoint: &EndpointId) -> bool { - let endpoint = endpoint.normalize(); if endpoint.is_endpoint() { - !self.endpoints.contains(&EndpointIdInt::from(&endpoint)) + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) } else if endpoint.is_branch() { !self .branches diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3a0e5609d8..68b91447f9 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -59,7 +59,7 @@ impl Api { if !self .caches .endpoints_cache - .is_valid(ctx, &user_info.endpoint) + .is_valid(ctx, &user_info.endpoint.normalize()) .await { info!("endpoint is not valid, skipping the request"); @@ -186,23 +186,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + 
.get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.normalize().into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -216,8 +220,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -230,16 +234,18 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.normalize().into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( From 0bb04ebe19c1dd024c7762926ecce166f4259d82 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 10 Apr 2024 12:12:55 +0200 Subject: [PATCH 009/157] Revert "Proxy read ids from redis (#7205)" (#7350) This reverts commit dbac2d2c473f3648251f0a64e36d066f444dfe00. ## Problem Proxy pods fails to install in k8s clusters, cplane release blocking. ## Summary of changes Revert --- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 15 +- proxy/src/cache.rs | 1 - proxy/src/cache/endpoints.rs | 190 ------------------ proxy/src/config.rs | 69 ------- proxy/src/console/provider.rs | 22 +- proxy/src/console/provider/neon.rs | 46 ++--- proxy/src/context.rs | 15 +- proxy/src/intern.rs | 15 -- proxy/src/lib.rs | 37 ---- proxy/src/metrics.rs | 12 -- proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 ++++++++ 16 files changed, 124 insertions(+), 408 deletions(-) delete mode 100644 proxy/src/cache/endpoints.rs create mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 71e9da18bc..e421798067 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. 
- let endpoint_int = EndpointIdInt::from(endpoint.normalize()); + let endpoint_int = EndpointIdInt::from(endpoint); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 9302b31d5c..56a3ef79cd 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -189,9 +189,7 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - /// cache for all valid endpoints - #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] - endpoint_cache_config: String, + #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -403,7 +401,6 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - maintenance_tasks.spawn(api.locks.garbage_collect_worker()); if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( @@ -413,9 +410,6 @@ async fn main() -> anyhow::Result<()> { args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - let cache = api.caches.endpoints_cache.clone(); - let con = redis_notifications_client.clone(); - maintenance_tasks.spawn(async move { cache.do_read(con).await }); } } } @@ -495,18 +489,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, - endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -517,9 +507,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout, epoch) + console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) .unwrap(), )); + tokio::spawn(locks.garbage_collect_worker(epoch)); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index d1d4087241..fc5f416395 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,5 +1,4 @@ pub mod common; -pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs deleted file mode 100644 index 31e3ef6891..0000000000 --- a/proxy/src/cache/endpoints.rs +++ /dev/null @@ -1,190 +0,0 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, -}; - -use dashmap::DashSet; -use redis::{ - 
streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; -use serde::Deserialize; -use tokio::sync::Mutex; - -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::REDIS_BROKEN_MESSAGES, - rate_limiter::GlobalRateLimiter, - redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; - -#[derive(Deserialize, Debug, Clone)] -#[serde(rename_all(deserialize = "snake_case"))] -pub enum ControlPlaneEventKey { - EndpointCreated, - BranchCreated, - ProjectCreated, -} - -pub struct EndpointsCache { - config: EndpointCacheConfig, - endpoints: DashSet, - branches: DashSet, - projects: DashSet, - ready: AtomicBool, - limiter: Arc>, -} - -impl EndpointsCache { - pub fn new(config: EndpointCacheConfig) -> Self { - Self { - limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( - config.limiter_info.clone(), - ))), - config, - endpoints: DashSet::new(), - branches: DashSet::new(), - projects: DashSet::new(), - ready: AtomicBool::new(false), - } - } - pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { - if !self.ready.load(Ordering::Acquire) { - return true; - } - // If cache is disabled, just collect the metrics and return. - if self.config.disable_cache { - ctx.set_rejected(self.should_reject(endpoint)); - return true; - } - // If the limiter allows, we don't need to check the cache. - if self.limiter.lock().await.check() { - return true; - } - let rejected = self.should_reject(endpoint); - ctx.set_rejected(rejected); - !rejected - } - fn should_reject(&self, endpoint: &EndpointId) -> bool { - if endpoint.is_endpoint() { - !self.endpoints.contains(&EndpointIdInt::from(endpoint)) - } else if endpoint.is_branch() { - !self - .branches - .contains(&BranchIdInt::from(&endpoint.as_branch())) - } else { - !self - .projects - .contains(&ProjectIdInt::from(&endpoint.as_project())) - } - } - fn insert_event(&self, key: ControlPlaneEventKey, value: String) { - // Do not do normalization here, we expect the events to be normalized. 
- match key { - ControlPlaneEventKey::EndpointCreated => { - self.endpoints.insert(EndpointIdInt::from(&value.into())); - } - ControlPlaneEventKey::BranchCreated => { - self.branches.insert(BranchIdInt::from(&value.into())); - } - ControlPlaneEventKey::ProjectCreated => { - self.projects.insert(ProjectIdInt::from(&value.into())); - } - } - } - pub async fn do_read( - &self, - mut con: ConnectionWithCredentialsProvider, - ) -> anyhow::Result { - let mut last_id = "0-0".to_string(); - loop { - self.ready.store(false, Ordering::Release); - if let Err(e) = con.connect().await { - tracing::error!("error connecting to redis: {:?}", e); - continue; - } - if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { - tracing::error!("error reading from redis: {:?}", e); - } - } - } - async fn read_from_stream( - &self, - con: &mut ConnectionWithCredentialsProvider, - last_id: &mut String, - ) -> anyhow::Result<()> { - tracing::info!("reading endpoints/branches/projects from redis"); - self.batch_read( - con, - StreamReadOptions::default().count(self.config.initial_batch_size), - last_id, - true, - ) - .await?; - tracing::info!("ready to filter user requests"); - self.ready.store(true, Ordering::Release); - self.batch_read( - con, - StreamReadOptions::default() - .count(self.config.initial_batch_size) - .block(self.config.xread_timeout.as_millis() as usize), - last_id, - false, - ) - .await - } - fn parse_key_value(key: &str, value: &Value) -> anyhow::Result<(ControlPlaneEventKey, String)> { - Ok((serde_json::from_str(key)?, String::from_redis_value(value)?)) - } - async fn batch_read( - &self, - conn: &mut ConnectionWithCredentialsProvider, - opts: StreamReadOptions, - last_id: &mut String, - return_when_finish: bool, - ) -> anyhow::Result<()> { - let mut total: usize = 0; - loop { - let mut res: StreamReadReply = conn - .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) - .await?; - if res.keys.len() != 1 { - anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); - } - - let res = res.keys.pop().expect("Checked length above"); - - if return_when_finish && res.ids.len() <= self.config.default_batch_size { - break; - } - for x in res.ids { - total += 1; - for (k, v) in x.map { - let (key, value) = match Self::parse_key_value(&k, &v) { - Ok(x) => x, - Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[&self.config.stream_name]) - .inc(); - tracing::error!("error parsing key-value {k}-{v:?}: {e:?}"); - continue; - } - }; - self.insert_event(key, value); - } - if total.is_power_of_two() { - tracing::debug!("endpoints read {}", total); - } - *last_id = x.id; - } - } - tracing::info!("read {} endpoints/branches/projects from redis", total); - Ok(()) - } -} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 3bdfb3cfad..fc490c7348 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,75 +313,6 @@ impl CertResolver { } } -#[derive(Debug)] -pub struct EndpointCacheConfig { - /// Batch size to receive all endpoints on the startup. - pub initial_batch_size: usize, - /// Batch size to receive endpoints. - pub default_batch_size: usize, - /// Timeouts for the stream read operation. - pub xread_timeout: Duration, - /// Stream name to read from. - pub stream_name: String, - /// Limiter info (to distinguish when to enable cache). - pub limiter_info: Vec, - /// Disable cache. - /// If true, cache is ignored, but reports all statistics. 
- pub disable_cache: bool, -} - -impl EndpointCacheConfig { - /// Default options for [`crate::console::provider::NodeInfoCache`]. - /// Notice that by default the limiter is empty, which means that cache is disabled. - pub const CACHE_DEFAULT_OPTIONS: &'static str = - "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s"; - - /// Parse cache options passed via cmdline. - /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. - fn parse(options: &str) -> anyhow::Result { - let mut initial_batch_size = None; - let mut default_batch_size = None; - let mut xread_timeout = None; - let mut stream_name = None; - let mut limiter_info = vec![]; - let mut disable_cache = false; - - for option in options.split(',') { - let (key, value) = option - .split_once('=') - .with_context(|| format!("bad key-value pair: {option}"))?; - - match key { - "initial_batch_size" => initial_batch_size = Some(value.parse()?), - "default_batch_size" => default_batch_size = Some(value.parse()?), - "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), - "stream_name" => stream_name = Some(value.to_string()), - "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), - "disable_cache" => disable_cache = value.parse()?, - unknown => bail!("unknown key: {unknown}"), - } - } - RateBucketInfo::validate(&mut limiter_info)?; - - Ok(Self { - initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, - default_batch_size: default_batch_size.context("missing `default_batch_size`")?, - xread_timeout: xread_timeout.context("missing `xread_timeout`")?, - stream_name: stream_name.context("missing `stream_name`")?, - disable_cache, - limiter_info, - }) - } -} - -impl FromStr for EndpointCacheConfig { - type Err = anyhow::Error; - - fn from_str(options: &str) -> Result { - let error = || format!("failed to parse endpoint cache options '{options}'"); - Self::parse(options).with_context(error) - } -} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index ee2bc866ab..f7d621fb12 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,15 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, + config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{convert::Infallible, sync::Arc, time::Duration}; +use std::{sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -416,15 +416,12 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, - /// List of all valid endpoints. 
- pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, - endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -434,7 +431,6 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), - endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -445,7 +441,6 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, - epoch: std::time::Duration, registered: prometheus::IntCounter, unregistered: prometheus::IntCounter, reclamation_lag: prometheus::Histogram, @@ -458,7 +453,6 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, - epoch: std::time::Duration, ) -> prometheus::Result { let registered = prometheus::IntCounter::with_opts( prometheus::Opts::new( @@ -503,7 +497,6 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, - epoch, lock_acquire_lag, registered, unregistered, @@ -543,9 +536,12 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self) -> anyhow::Result { - let mut interval = - tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); + pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + if self.permits == 0 { + return; + } + + let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 68b91447f9..1a3e2ca795 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,7 +8,6 @@ use super::{ }; use crate::{ auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, - Normalize, }; use crate::{ cache::Cached, @@ -24,7 +23,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - pub locks: &'static ApiLocks, + locks: &'static ApiLocks, jwt: String, } @@ -56,15 +55,6 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - if !self - .caches - .endpoints_cache - .is_valid(ctx, &user_info.endpoint.normalize()) - .await - { - info!("endpoint is not valid, skipping the request"); - return Ok(AuthInfo::default()); - } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -91,9 +81,7 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => { - return Ok(AuthInfo::default()); - } + Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), _otherwise => return Err(e.into()), }, }; @@ -186,27 +174,23 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let normalized_ep = &user_info.endpoint.normalize(); + let ep = &user_info.endpoint; let user = &user_info.user; - if let Some(role_secret) = self - .caches - .project_info - .get_role_secret(normalized_ep, user) - { + if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( project_id, - normalized_ep_int, + ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - normalized_ep_int, + ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -220,8 +204,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let normalized_ep = &user_info.endpoint.normalize(); - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + let ep = &user_info.endpoint; + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -234,18 +218,16 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( project_id, - normalized_ep_int, + ep_int, user.into(), auth_info.secret.clone(), ); - self.caches.project_info.insert_allowed_ips( - project_id, - normalized_ep_int, - allowed_ips.clone(), - ); + self.caches + .project_info + .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); ctx.set_project_id(project_id); } Ok(( diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 85544f1d65..fec95f4722 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,9 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{ - bool_to_str, LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND, NUM_INVALID_ENDPOINTS, - }, + metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, DbName, EndpointId, RoleName, }; @@ -52,8 +50,6 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, - // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
- rejected: bool, } #[derive(Clone, Debug)] @@ -97,7 +93,6 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -118,10 +113,6 @@ impl RequestMonitoring { ) } - pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = rejected; - } - pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -187,10 +178,6 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { - let outcome = if self.success { "success" } else { "failure" }; - NUM_INVALID_ENDPOINTS - .with_label_values(&[self.protocol, bool_to_str(self.rejected), outcome]) - .inc(); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index e38135dd22..a6519bdff9 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,11 +160,6 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } -impl From for EndpointIdInt { - fn from(value: EndpointId) -> Self { - EndpointIdTag::get_interner().get_or_intern(&value) - } -} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -180,11 +175,6 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } -impl From for BranchIdInt { - fn from(value: BranchId) -> Self { - BranchIdTag::get_interner().get_or_intern(&value) - } -} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -200,11 +190,6 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } -impl From for ProjectIdInt { - fn from(value: ProjectId) -> Self { - ProjectIdTag::get_interner().get_or_intern(&value) - } -} #[cfg(test)] mod tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 3f6d985fe8..da7c7f3ed2 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,24 +127,6 @@ macro_rules! smol_str_wrapper { }; } -const POOLER_SUFFIX: &str = "-pooler"; - -pub trait Normalize { - fn normalize(&self) -> Self; -} - -impl + From> Normalize for S { - fn normalize(&self) -> Self { - if self.as_ref().ends_with(POOLER_SUFFIX) { - let mut s = self.as_ref().to_string(); - s.truncate(s.len() - POOLER_SUFFIX.len()); - s.into() - } else { - self.clone() - } - } -} - // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -158,22 +140,3 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); - -// Endpoints are a bit tricky. Rare they might be branches or projects. 
-impl EndpointId { - pub fn is_endpoint(&self) -> bool { - self.0.starts_with("ep-") - } - pub fn is_branch(&self) -> bool { - self.0.starts_with("br-") - } - pub fn is_project(&self) -> bool { - !self.is_endpoint() && !self.is_branch() - } - pub fn as_branch(&self) -> BranchId { - BranchId(self.0.clone()) - } - pub fn as_project(&self) -> ProjectId { - ProjectId(self.0.clone()) - } -} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f299313e0a..59ee899c08 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -169,18 +169,6 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { .unwrap() }); -pub static NUM_INVALID_ENDPOINTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_invalid_endpoints_total", - "Number of invalid endpoints (per protocol, per rejected).", - // http/ws/tcp, true/false, success/failure - // TODO(anna): the last dimension is just a proxy to what we actually want to measure. - // We need to measure whether the endpoint was found by cplane or not. - &["protocol", "rejected", "outcome"], - ) - .unwrap() -}); - pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 166e761a4e..6051c0a812 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, Normalize, + EndpointCacheKey, }; use futures::TryFutureExt; use itertools::Itertools; @@ -280,7 +280,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep.normalize(), 1) { + if !endpoint_rate_limiter.check(ep, 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index a3b83e5e50..13dffffca0 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 0503deb311..f590896dd9 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -24,13 +24,13 @@ use super::{ RateLimiterConfig, }; -pub struct GlobalRateLimiter { +pub struct RedisRateLimiter { data: Vec, - info: Vec, + info: &'static [RateBucketInfo], } -impl GlobalRateLimiter { - pub fn new(info: Vec) -> Self { +impl RedisRateLimiter { + pub fn new(info: &'static [RateBucketInfo]) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +50,7 @@ impl GlobalRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(&self.info) + .zip(self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 7baf104374..422789813c 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use 
crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: GlobalRateLimiter, + limiter: RedisRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: GlobalRateLimiter::new(info.into()), + limiter: RedisRateLimiter::new(info), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py new file mode 100644 index 0000000000..f39f0cad07 --- /dev/null +++ b/test_runner/regress/test_proxy_rate_limiter.py @@ -0,0 +1,84 @@ +import asyncio +import time +from pathlib import Path +from typing import Iterator + +import pytest +from fixtures.neon_fixtures import ( + PSQL, + NeonProxy, +) +from fixtures.port_distributor import PortDistributor +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.response import Response + + +def waiting_handler(status_code: int) -> Response: + # wait more than timeout to make sure that both (two) connections are open. + # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. + time.sleep(2) + return Response(status=status_code) + + +@pytest.fixture(scope="function") +def proxy_with_rate_limit( + port_distributor: PortDistributor, + neon_binpath: Path, + httpserver_listen_address, + test_output_dir: Path, +) -> Iterator[NeonProxy]: + """Neon proxy that routes directly to vanilla postgres.""" + + proxy_port = port_distributor.get_port() + mgmt_port = port_distributor.get_port() + http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + (host, port) = httpserver_listen_address + endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + + with NeonProxy( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + proxy_port=proxy_port, + http_port=http_port, + mgmt_port=mgmt_port, + external_http_port=external_http_port, + auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), + ) as proxy: + proxy.start() + yield proxy + + +@pytest.mark.asyncio +async def test_proxy_rate_limit( + httpserver: HTTPServer, + proxy_with_rate_limit: NeonProxy, +): + uri = "/billing/api/v1/usage_events/proxy_get_role_secret" + # mock control plane service + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: Response(status=200) + ) + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: waiting_handler(429) + ) + httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( + lambda _: waiting_handler(500) + ) + + psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) + f = await psql.run("select 42;") + await proxy_with_rate_limit.find_auth_link(uri, f) + # Limit should be 2. + + # Run two queries in parallel. + f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) + await proxy_with_rate_limit.find_auth_link(uri, f1) + await proxy_with_rate_limit.find_auth_link(uri, f2) + + # Now limit should be 0. + f = await psql.run("select 42;") + await proxy_with_rate_limit.find_auth_link(uri, f) + + # There last query shouldn't reach the http-server. 
+ assert httpserver.assertions == [] From f86845f64b9576d05b06de9c33dec3c6be19c47c Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Wed, 10 Apr 2024 06:13:48 -0700 Subject: [PATCH 010/157] compute_ctl: Auto-set dynamic_shared_memory_type (#7348) Part of neondatabase/cloud#12047. The basic idea is that for our VMs, we want to enable swap and disable Linux memory overcommit. Alongside these, we should set postgres' dynamic_shared_memory_type to mmap, but we want to avoid setting it to mmap if swap is not enabled. Implementing this in the control plane would be fiddly, but it's relatively straightforward to add to compute_ctl. --- compute_tools/src/config.rs | 25 +++++++++++++++++++++++-- compute_tools/src/pg_helpers.rs | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index f1fd8637f5..89c866b20c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -92,6 +92,27 @@ pub fn write_postgres_conf( } } + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). + .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; + } + } + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5deb50d6b7..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } From d47e4a2a4148ff0b6467d5bda504401b90bb00da Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 11 Apr 2024 07:47:45 +0300 Subject: [PATCH 011/157] Remember last written LSN when it is first requested (#7343) ## Problem See https://neondb.slack.com/archives/C03QLRH7PPD/p1712529369520409 In case of statements CREATE TABLE AS SELECT... or INSERT FROM SELECT... we are fetching data from source table and storing it in destination table. It cause problems with prefetch last-written-lsn is known for the pages of source table (which for example happens after compute restart). 
In this case we get get global value of last-written-lsn which is changed frequently as far as we are writing pages of destination table. As a result request-isn for the prefetch and request-let when this page is actually needed are different and we got exported prefetch request. So it actually disarms prefetch. ## Summary of changes Proposed simple patch stores last-written LSN for the page when it is not found. So next time we will request last-written LSN for this page, we will get the same value (certainly if the page was not changed). ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a7b4c66156..d9149dc59a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a7b4c66156bce00afa60e5592d4284ba9e40b4cf +Subproject commit d9149dc59abcbeeb26293707509aef51752db28f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 64b8c7bccc..85d809c124 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed +Subproject commit 85d809c124a898847a97d66a211f7d5ef4f8e0cb diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3946b2e2ea..261497dd63 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 +Subproject commit 261497dd63ace434045058b1453bcbaaa83f23e5 diff --git a/vendor/revisions.json b/vendor/revisions.json index 75dc095168..dfc0aa04c3 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", - "postgres-v15": "64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed", - "postgres-v14": "a7b4c66156bce00afa60e5592d4284ba9e40b4cf" + "postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5", + "postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb", + "postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f" } From db72543f4d4d3300d48375db177c8ee598ed4049 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 11 Apr 2024 12:31:27 +0200 Subject: [PATCH 012/157] Reenable test_forward_compatibility (#7358) It was disabled due to https://github.com/neondatabase/neon/pull/6530 breaking forward compatiblity. 
Now that we have deployed it to production, we can reenable the test --- test_runner/regress/test_compatibility.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 208263a22a..ddad98a5fa 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -192,9 +192,6 @@ def test_backward_compatibility( assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530 -# The test is disabled until the next release deployment -@pytest.mark.xfail @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") From 1628b5b145b335e4a26fcdb1ccdf4263ab8745cf Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 11 Apr 2024 17:14:09 +0300 Subject: [PATCH 013/157] compute hook: use shared client with explicit timeout (#7359) ## Problem We are seeing some mysterious long waits when sending requests. ## Summary of changes - To eliminate risk that we are incurring some unreasonable overheads from setup, e.g. DNS, use a single Client (internally a pool) instead of repeatedly constructing a fresh one. - To make it clearer where a timeout is occurring, apply a 10 second timeout to requests as we send them. --- storage_controller/src/compute_hook.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index eb0c4472e4..1ed8998713 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -17,6 +17,8 @@ use crate::service::Config; const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + pub(crate) const API_CONCURRENCY: usize = 32; struct UnshardedComputeHookTenant { @@ -242,6 +244,10 @@ pub(super) struct ComputeHook { // This lock is only used in testing enviroments, to serialize calls into neon_lock neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, } impl ComputeHook { @@ -251,12 +257,18 @@ impl ComputeHook { .clone() .map(|jwt| format!("Bearer {}", jwt)); + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + Self { state: Default::default(), config, authorization_header, neon_local_lock: Default::default(), api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, } } @@ -310,12 +322,11 @@ impl ComputeHook { async fn do_notify_iteration( &self, - client: &reqwest::Client, url: &String, reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = client.request(Method::PUT, url); + let req = self.client.request(Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -381,8 +392,6 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let client = reqwest::Client::new(); - // We hold these semaphore units across all retries, rather than only across 
each // HTTP request: this is to preserve fairness and avoid a situation where a retry might // time out waiting for a semaphore. @@ -394,7 +403,7 @@ impl ComputeHook { .map_err(|_| NotifyError::ShuttingDown)?; backoff::retry( - || self.do_notify_iteration(&client, url, reconfigure_request, cancel), + || self.do_notify_iteration(url, reconfigure_request, cancel), |e| { matches!( e, From 99a56b56064264fd73a7dc3ce5606469725cc4cb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 11 Apr 2024 15:23:08 +0100 Subject: [PATCH 014/157] CI(build-build-tools-image): Do not cancel concurrent workflows (#7226) ## Problem `build-build-tools-image` workflow is designed to be run only in one example per the whole repository. Currently, the job gets cancelled if a newer one is scheduled, here's an example: https://github.com/neondatabase/neon/actions/runs/8419610607 ## Summary of changes - Explicitly set `cancel-in-progress: false` for all jobs that aren't supposed to be cancelled --- .github/workflows/approved-for-ci-run.yml | 1 + .github/workflows/build-build-tools-image.yml | 1 + .github/workflows/pin-build-tools-image.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 69c48d86b9..ab616d17e2 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 251423e701..c527cef1ac 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -21,6 +21,7 @@ defaults: concurrency: group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c941692066..d495a158e8 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -20,6 +20,7 @@ defaults: concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false permissions: {} From 5299f917d6d2be5d87b56d236342d48682a5c9f4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 11 Apr 2024 17:26:01 +0100 Subject: [PATCH 015/157] proxy: replace prometheus with measured (#6717) ## Problem My benchmarks show that prometheus is not very good. https://github.com/conradludgate/measured We're already using it in storage_controller and it seems to be working well. ## Summary of changes Replace prometheus with my new measured crate in proxy only. Apologies for the large diff. I tried to keep it as minimal as I could. The label types add a bit of boiler plate (but reduce the chance we mistype the labels), and some of our custom metrics like CounterPair and HLL needed to be rewritten. 
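As a rough aid for reviewing the HLL rewrite below: the following is a minimal, dependency-free sketch (not the proxy code itself) of what the shard registers and the harmonic-mean estimate compute. The register-update arithmetic mirrors the `record()` logic in `libs/metrics/src/hll.rs`; the bias-correction constant `alpha ≈ 0.709` for 64 registers is an assumed textbook value (see the Wikipedia article referenced in the module docs), not something taken from this diff.

```rust
use std::collections::hash_map::RandomState;
use std::hash::BuildHasher;

/// Illustrative stand-in for the per-label-value shard array (no atomics, no labels).
struct Hll<const N: usize> {
    shards: [u8; N],
}

impl<const N: usize> Hll<N> {
    fn new() -> Self {
        assert!(N.is_power_of_two());
        Self { shards: [0; N] }
    }

    /// Same arithmetic as `record()` in hll.rs: the low bits pick a shard, the remaining
    /// bits contribute the "leading zeros + 1" rank, and the shard keeps the maximum rank.
    fn record(&mut self, hash: u64) {
        let p = N.ilog2() as u8;
        let j = (hash & (N as u64 - 1)) as usize;
        let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
        self.shards[j] = self.shards[j].max(rho);
    }

    /// Raw HLL estimate: alpha * N^2 / sum(2^-shard), i.e. the harmonic mean that the
    /// PromQL snippets in the module docs reconstruct from the exported per-shard values.
    fn estimate(&self, alpha: f64) -> f64 {
        let denom: f64 = self.shards.iter().map(|&s| 2f64.powi(-i32::from(s))).sum();
        alpha * (N * N) as f64 / denom
    }
}

fn main() {
    let hasher = RandomState::new();
    let mut hll = Hll::<64>::new();
    for item in 0..10_000u32 {
        hll.record(hasher.hash_one(item));
    }
    // ~0.709 is the assumed alpha for 64 registers; expect a result in the ballpark of 10k.
    println!("estimated cardinality ≈ {:.0}", hll.estimate(0.709));
}
```

This is only meant to make the PromQL in the rewritten module easier to review; the real implementation keeps the registers in per-label `AtomicU8` shards and resets them when samples are collected.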
--- Cargo.lock | 13 +- Cargo.toml | 4 +- libs/metrics/src/hll.rs | 395 ++++------------ libs/metrics/src/lib.rs | 172 ++++++- proxy/Cargo.toml | 1 + proxy/src/auth/backend.rs | 10 +- proxy/src/auth/credentials.rs | 21 +- proxy/src/bin/pg_sni_router.rs | 7 +- proxy/src/bin/proxy.rs | 40 +- proxy/src/cancellation.rs | 34 +- proxy/src/compute.rs | 9 +- proxy/src/console/messages.rs | 5 +- proxy/src/console/provider.rs | 63 +-- proxy/src/console/provider/neon.rs | 32 +- proxy/src/context.rs | 24 +- proxy/src/context/parquet.rs | 2 +- proxy/src/error.rs | 9 +- proxy/src/http.rs | 21 +- proxy/src/http/health_server.rs | 89 +++- proxy/src/jemalloc.rs | 178 +++---- proxy/src/metrics.rs | 658 +++++++++++++++----------- proxy/src/proxy.rs | 30 +- proxy/src/proxy/connect_compute.rs | 8 +- proxy/src/proxy/passthrough.rs | 16 +- proxy/src/proxy/wake_compute.rs | 31 +- proxy/src/rate_limiter/limiter.rs | 30 +- proxy/src/redis/notifications.rs | 10 +- proxy/src/serverless.rs | 28 +- proxy/src/serverless/conn_pool.rs | 51 +- proxy/src/serverless/sql_over_http.rs | 27 +- proxy/src/serverless/websocket.rs | 9 +- proxy/src/stream.rs | 4 +- 32 files changed, 1127 insertions(+), 904 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bdf2b08c5c..6faf4b72f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2932,9 +2932,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.20" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452" +checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" dependencies = [ "bytes", "crossbeam-utils", @@ -2950,9 +2950,9 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.20" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def" +checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -2962,9 +2962,9 @@ dependencies = [ [[package]] name = "measured-process" -version = "0.0.20" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2" +checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" dependencies = [ "libc", "measured", @@ -4322,6 +4322,7 @@ dependencies = [ "itertools", "lasso", "md5", + "measured", "metrics", "native-tls", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index feea17ab05..8310d2d522 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,8 +107,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.20", features=["lasso"] } -measured-process = { version = "0.0.20" } +measured = { version = "0.0.21", features=["lasso"] } +measured-process = { version = "0.0.21" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index dfb4461ce9..f53511ab5c 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -7,14 +7,19 @@ //! use significantly less memory than this, but can only approximate the cardinality. 
use std::{ - collections::HashMap, - hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, - sync::{atomic::AtomicU8, Arc, RwLock}, + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, }; -use prometheus::{ - core::{self, Describer}, - proto, Opts, +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{ + group::{Encoding, MetricValue}, + name::MetricNameEncoder, + Metric, MetricType, MetricVec, + }, + text::TextEncoder, + LabelGroup, }; use twox_hash::xxh3; @@ -93,203 +98,25 @@ macro_rules! register_hll { /// ``` /// /// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLogVec { - core: Arc>, +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], } - -struct HyperLogLogVecCore { - pub children: RwLock, BuildHasherDefault>>, - pub desc: core::Desc, - pub opts: Opts, -} - -impl core::Collector for HyperLogLogVec { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] - } - - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - for child in self.core.children.read().unwrap().values() { - child.core.collect_into(&mut metrics); - } - m.set_metric(metrics); - - vec![m] +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } } } -impl HyperLogLogVec { - /// Create a new [`HyperLogLogVec`] based on the provided - /// [`Opts`] and partitioned by the given label names. At least one label name must be - /// provided. - pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { - assert!(N.is_power_of_two()); - let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); - let opts = opts.variable_labels(variable_names); - - let desc = opts.describe()?; - let v = HyperLogLogVecCore { - children: RwLock::new(HashMap::default()), - desc, - opts, - }; - - Ok(Self { core: Arc::new(v) }) - } - - /// `get_metric_with_label_values` returns the [`HyperLogLog
<N>`] for the given slice - /// of label values (same order as the VariableLabels in Desc). If that combination of - /// label values is accessed for the first time, a new [`HyperLogLog<N>
`] is created. - /// - /// An error is returned if the number of label values is not the same as the - /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - self.core.get_metric_with_label_values(vals) - } - - /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error - /// occurs. - pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { - self.get_metric_with_label_values(vals).unwrap() - } +impl MetricType for HyperLogLogState { + type Metadata = (); } -impl HyperLogLogVecCore { - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - let h = self.hash_label_values(vals)?; - - if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { - return Ok(metric); - } - - self.get_or_create_metric(h, vals) - } - - pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { - if vals.len() != self.desc.variable_labels.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: self.desc.variable_labels.len(), - got: vals.len(), - }); - } - - let mut h = xxh3::Hash64::default(); - for val in vals { - h.write(val.as_bytes()); - } - - Ok(h.finish()) - } - - fn get_or_create_metric( - &self, - hash: u64, - label_values: &[&str], - ) -> prometheus::Result> { - let mut children = self.children.write().unwrap(); - // Check exist first. - if let Some(metric) = children.get(&hash).cloned() { - return Ok(metric); - } - - let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; - children.insert(hash, metric.clone()); - Ok(metric) - } -} - -/// HLL is a probabilistic cardinality measure. -/// -/// How to use this time-series for a metric name `my_metrics_total_hll`: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// If you want an estimate over time, you can use the following query: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max ( -/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) -/// ) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// In the case of low cardinality, you might want to use the linear counting approximation: -/// -/// ```promql -/// # LinearCounting(m, V) = m log (m / V) -/// shards_count * ln(shards_count / -/// # calculate V = how many shards contain a 0 -/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) -/// ) -/// ``` -/// -/// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLog { - core: Arc>, -} - -impl HyperLogLog { - /// Create a [`HyperLogLog`] with the `name` and `help` arguments. - pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { - assert!(N.is_power_of_two()); - let opts = Opts::new(name, help); - Self::with_opts(opts) - } - - /// Create a [`HyperLogLog`] with the `opts` options. 
- pub fn with_opts(opts: Opts) -> prometheus::Result { - Self::with_opts_and_label_values(&opts, &[]) - } - - fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { - let desc = opts.describe()?; - let labels = make_label_pairs(&desc, label_values)?; - - let v = HyperLogLogCore { - shards: [0; N].map(AtomicU8::new), - desc, - labels, - }; - Ok(Self { core: Arc::new(v) }) - } - +impl HyperLogLogState { pub fn measure(&self, item: &impl Hash) { // changing the hasher will break compatibility with previous measurements. self.record(BuildHasherDefault::::default().hash_one(item)); @@ -299,42 +126,11 @@ impl HyperLogLog { let p = N.ilog2() as u8; let j = hash & (N as u64 - 1); let rho = (hash >> p).leading_zeros() as u8 + 1 - p; - self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); - } -} - -struct HyperLogLogCore { - shards: [AtomicU8; N], - desc: core::Desc, - labels: Vec, -} - -impl core::Collector for HyperLogLog { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); } - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - self.core.collect_into(&mut metrics); - m.set_metric(metrics); - - vec![m] - } -} - -impl HyperLogLogCore { - fn collect_into(&self, metrics: &mut Vec) { - self.shards.iter().enumerate().for_each(|(i, x)| { - let mut shard_label = proto::LabelPair::default(); - shard_label.set_name("hll_shard".to_owned()); - shard_label.set_value(format!("{i}")); - + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // This seems like it would be a race condition, @@ -344,85 +140,90 @@ impl HyperLogLogCore { // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // this would mean that a dev port-forwarding the metrics url won't break the sampling. 
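To make the `record` logic above concrete, here is a minimal, self-contained sketch of the same sharding scheme together with the harmonic-mean estimate that the PromQL doc comments describe. The `estimate` helper, the `alpha` value for 32 shards, and the use of `RandomState` are illustrative assumptions, not code from this patch:

```rust
use std::collections::hash_map::RandomState;
use std::hash::BuildHasher;

const N: usize = 32; // shard count; must be a power of two

// Same shard update as `record` above: the low bits pick a shard, and the
// rank of the first set bit in the remaining bits is the recorded value.
fn record(shards: &mut [u8; N], hash: u64) {
    let p = N.ilog2() as u8;
    let j = (hash & (N as u64 - 1)) as usize;
    let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
    shards[j] = shards[j].max(rho);
}

// Harmonic-mean estimate, mirroring the PromQL in the doc comments.
fn estimate(shards: &[u8; N]) -> f64 {
    let alpha = 0.697; // bias correction for m = 32 shards (assumed constant)
    let z: f64 = shards.iter().map(|&r| 2f64.powi(-i32::from(r))).sum();
    alpha * (N as f64) * (N as f64) / z
}

fn main() {
    let hasher = RandomState::new();
    let mut shards = [0u8; N];
    for item in 0..10_000u64 {
        record(&mut shards, hasher.hash_one(&item));
    }
    println!("approximate distinct count: {:.0}", estimate(&shards));
}
```

With 32 shards the relative error of this estimator is roughly 1.04/sqrt(32), i.e. about 18%.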
- let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); - - let mut m = proto::Metric::default(); - let mut c = proto::Gauge::default(); - c.set_value(v as f64); - m.set_gauge(c); - - let mut labels = Vec::with_capacity(self.labels.len() + 1); - labels.extend_from_slice(&self.labels); - labels.push(shard_label); - - m.set_label(labels); - metrics.push(m); + x.swap(0, std::sync::atomic::Ordering::Relaxed) }) } } - -fn make_label_pairs( - desc: &core::Desc, - label_values: &[&str], -) -> prometheus::Result> { - if desc.variable_labels.len() != label_values.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: desc.variable_labels.len(), - got: label_values.len(), - }); +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } - let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); - if total_len == 0 { - return Ok(vec![]); - } + struct HllShardLabel { + hll_shard: i64, + } - if desc.variable_labels.is_empty() { - return Ok(desc.const_label_pairs.clone()); - } + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } - let mut label_pairs = Vec::with_capacity(total_len); - for (i, n) in desc.variable_labels.iter().enumerate() { - let mut label_pair = proto::LabelPair::default(); - label_pair.set_name(n.clone()); - label_pair.set_value(label_values[i].to_owned()); - label_pairs.push(label_pair); + self.take_sample() + .into_iter() + .enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) } - - for label_pair in &desc.const_label_pairs { - label_pairs.push(label_pair.clone()); - } - label_pairs.sort(); - Ok(label_pairs) } #[cfg(test)] mod tests { use std::collections::HashSet; - use prometheus::{proto, Opts}; + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use rand::{rngs::StdRng, Rng, SeedableRng}; use rand_distr::{Distribution, Zipf}; use crate::HyperLogLogVec; - fn collect(hll: &HyperLogLogVec<32>) -> Vec { - let mut metrics = vec![]; - hll.core - .children - .read() - .unwrap() - .values() - .for_each(|c| c.core.collect_into(&mut metrics)); - metrics + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, } - fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. 
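For orientation, the `MetricEncoding` impl above emits one gauge sample per shard, distinguished by an `hll_shard` label that the PromQL queries then aggregate away. Schematically, the scrape output looks like this (metric name taken from the doc-comment example; the values are invented):

```
proxy_connecting_endpoints{protocol="tcp",hll_shard="0"} 3
proxy_connecting_endpoints{protocol="tcp",hll_shard="1"} 0
proxy_connecting_endpoints{protocol="tcp",hll_shard="2"} 5
...
proxy_connecting_endpoints{protocol="tcp",hll_shard="31"} 2
```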
+ ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { let mut buckets = [0.0; 32]; - for metric in metrics.chunks_exact(32) { - if filter(&metric[0]) { - for (i, m) in metric.iter().enumerate() { - buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); - } + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); } } @@ -437,7 +238,7 @@ mod tests { } fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { - let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + let hll = HyperLogLogVec::, 32>::new(); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut set_a = HashSet::new(); @@ -445,18 +246,20 @@ mod tests { for x in iter.by_ref().take(n) { set_a.insert(x.to_bits()); - hll.with_label_values(&["a"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); } for x in iter.by_ref().take(n) { set_b.insert(x.to_bits()); - hll.with_label_values(&["b"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); } let merge = &set_a | &set_b; - let metrics = collect(&hll); - let len = get_cardinality(&metrics, |_| true); - let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); - let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 6cff28c0ca..2cf3cdeaa7 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -5,7 +5,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] use measured::{ - label::{LabelGroupVisitor, LabelName, NoLabels}, + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, metric::{ counter::CounterState, gauge::GaugeState, @@ -40,7 +40,7 @@ pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub use hll::{HyperLogLog, HyperLogLogVec}; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; @@ -421,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + 
self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. +struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 12bd67ea36..6b8f2ecbf4 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -44,6 +44,7 @@ ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured 
= { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..229d499e30 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -13,7 +13,7 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; use crate::intern::EndpointIdInt; -use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED}; +use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::stream::Stream; @@ -210,8 +210,12 @@ impl AuthenticationConfig { enabled = self.rate_limiter_enabled, "rate limiting authentication" ); - AUTH_RATE_LIMIT_HITS.inc(); - ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); if self.rate_limiter_enabled { return Err(auth::AuthError::too_many_connections()); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 89773aa1ff..783a1a5a21 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -4,7 +4,7 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + metrics::{Metrics, SniKind}, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, @@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint { ctx.set_endpoint_id(ep.clone()); } + let metrics = Metrics::get(); info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index c28814b1c8..58737efe46 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -176,7 +176,12 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + proxy::metrics::Protocol::SniRouter, + "sni", + ); handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..3392c21075 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -18,7 +18,8 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; -use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::AuthRateLimiter; use proxy::rate_limiter::EndpointRateLimiter; use 
proxy::rate_limiter::RateBucketInfo; @@ -249,14 +250,18 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); - match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { - Ok(t) => { - t.start(); + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None } - Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), - } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; @@ -349,7 +354,7 @@ async fn main() -> anyhow::Result<()> { >::new( cancel_map.clone(), redis_publisher, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, + proxy::metrics::CancellationSource::FromClient, )); // client facing tasks. these will exit on error or on cancellation @@ -387,7 +392,14 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { @@ -507,8 +519,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), + console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + &Metrics::get().wake_compute_lock, + ) + .unwrap(), )); tokio::spawn(locks.garbage_collect_worker(epoch)); diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 6151513614..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use crate::{ error::ReportableError, - metrics::NUM_CANCELLATION_REQUESTS, + metrics::{CancellationRequest, CancellationSource, Metrics}, redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }, @@ -28,7 +28,7 @@ pub struct CancellationHandler
<P>
{ client: P, /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. - from: &'static str, + from: CancellationSource, } #[derive(Debug, Error)] @@ -89,9 +89,13 @@ impl CancellationHandler

{ // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "not_found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); match self.client.try_publish(key, session_id).await { Ok(()) => {} // do nothing Err(e) => { @@ -103,9 +107,13 @@ impl CancellationHandler

{ } return Ok(()); }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -122,7 +130,7 @@ impl CancellationHandler

{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: &'static str) -> Self { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { Self { map, client: (), @@ -132,7 +140,7 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: &'static str) -> Self { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { Self { map, client, from } } } @@ -192,15 +200,13 @@ impl
<P> Drop for Session<P>
{ #[cfg(test)] mod tests { - use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; - use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( CancelMap::default(), - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + CancellationSource::FromRedis, )); let session = cancellation_handler.clone().get_session(); @@ -214,7 +220,7 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), "local"); + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ee33b97fbd..149a619316 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -4,12 +4,11 @@ use crate::{ console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_DB_CONNECTIONS_GAUGE, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -249,7 +248,7 @@ pub struct PostgresConnection { /// Labels for proxy's metrics. pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -295,9 +294,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 45161f5ac8..9869b95768 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,3 +1,4 @@ +use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use std::fmt; @@ -102,7 +103,7 @@ pub struct MetricsAuxInfo { pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] @@ -110,9 +111,11 @@ pub enum ColdStartInfo { /// Compute was already running Warm, #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] /// Compute was not running but there was an available VM VmPoolHit, #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] /// Compute was not running and there were no VMs available VmPoolMiss, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..b9502f0722 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -13,6 +13,7 @@ use crate::{ config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, + metrics::ApiLockMetrics, scram, EndpointCacheKey, }; use dashmap::DashMap; @@ -441,10 +442,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + metrics: &'static ApiLockMetrics, } impl ApiLocks { @@ -453,54 +451,14 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + metrics: &'static ApiLockMetrics, ) 
-> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), permits, timeout, - lock_acquire_lag, - registered, - unregistered, - reclamation_lag, + metrics, }) } @@ -520,7 +478,7 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); + self.metrics.semaphores_registered.inc(); Arc::new(Semaphore::new(self.permits)) }) .clone() @@ -528,8 +486,9 @@ impl ApiLocks { }; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: Some(permit??), @@ -554,13 +513,13 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..9ac1900324 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,13 +7,14 @@ use super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, -}; -use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + scram, }; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -95,7 +96,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ 
-206,14 +210,16 @@ impl super::Api for Api { ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { let ep = &user_info.endpoint; if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..0094235921 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -29,7 +29,7 @@ static LOG_CHAN: OnceCell> = OnceCell::ne pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, pub span: Span, @@ -65,7 +65,7 @@ impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { let span = info_span!( @@ -102,7 +102,7 @@ impl RequestMonitoring { #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -134,9 +134,9 @@ impl RequestMonitoring { pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); self.endpoint_id = Some(endpoint_id); } } @@ -158,13 +158,11 @@ impl RequestMonitoring { } pub fn set_error_kind(&mut self, kind: ErrorKind) { - ERROR_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .inc(); + Metrics::get().proxy.errors_total.inc(kind); if let Some(ep) = &self.endpoint_id { - ENDPOINT_ERRORS_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .measure(ep); + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); } self.error_kind = Some(kind); } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index eb77409429..e061216d15 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -111,7 +111,7 @@ impl From<&RequestMonitoring> for RequestData { super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", }), - protocol: value.protocol, + protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 4614f3913d..fdfe50a494 100644 --- a/proxy/src/error.rs +++ 
b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. Not necessarily user error + #[label(rename = "clientdisconnect")] ClientDisconnect, /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] ServiceRateLimit, /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, /// Postgres error diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..95ca0ccd5c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -13,7 +13,11 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + rate_limiter, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, @@ -90,13 +94,14 @@ impl Endpoint { /// Execute a [request](reqwest::Request). pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cbb17ebcb7..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,30 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use std::{convert::Infallible, net::TcpListener}; -use tracing::info; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; use utils::http::{ - endpoint::{self, prometheus_metrics_handler, request_span}, + endpoint::{self, request_span}, error::ApiError, json::json_response, RouterBuilder, RouterService, }; +use crate::jemalloc; + async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + endpoint::make_router() - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) .get("/v1/status", status_handler) } -pub async fn 
task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) @@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index ed20798d56..3243e6a140 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,27 +1,45 @@ -use std::time::Duration; +use std::marker::PhantomData; -use metrics::IntGauge; -use prometheus::{register_int_gauge_with_registry, Registry}; +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, - active: stats::active_mib, - active_gauge: IntGauge, - allocated: stats::allocated_mib, - allocated_gauge: IntGauge, - mapped: stats::mapped_mib, - mapped_gauge: IntGauge, - metadata: stats::metadata_mib, - metadata_gauge: IntGauge, - resident: stats::resident_mib, - resident_gauge: IntGauge, - retained: stats::retained_mib, - retained_gauge: IntGauge, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } } impl MetricRecorder { - pub fn new(registry: &Registry) -> Result { + pub fn new() -> Result { tracing::info!( config = config::malloc_conf::read()?, version = version::read()?, @@ -30,71 +48,69 @@ impl MetricRecorder { Ok(Self { epoch: epoch::mib()?, 
- active: stats::active::mib()?, - active_gauge: register_int_gauge_with_registry!( - "jemalloc_active_bytes", - "Total number of bytes in active pages allocated by the process", - registry - )?, - allocated: stats::allocated::mib()?, - allocated_gauge: register_int_gauge_with_registry!( - "jemalloc_allocated_bytes", - "Total number of bytes allocated by the process", - registry - )?, - mapped: stats::mapped::mib()?, - mapped_gauge: register_int_gauge_with_registry!( - "jemalloc_mapped_bytes", - "Total number of bytes in active extents mapped by the allocator", - registry - )?, - metadata: stats::metadata::mib()?, - metadata_gauge: register_int_gauge_with_registry!( - "jemalloc_metadata_bytes", - "Total number of bytes dedicated to jemalloc metadata", - registry - )?, - resident: stats::resident::mib()?, - resident_gauge: register_int_gauge_with_registry!( - "jemalloc_resident_bytes", - "Total number of bytes in physically resident data pages mapped by the allocator", - registry - )?, - retained: stats::retained::mib()?, - retained_gauge: register_int_gauge_with_registry!( - "jemalloc_retained_bytes", - "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", - registry - )?, - }) - } - - fn _poll(&self) -> Result<(), anyhow::Error> { - self.epoch.advance()?; - self.active_gauge.set(self.active.read()? as i64); - self.allocated_gauge.set(self.allocated.read()? as i64); - self.mapped_gauge.set(self.mapped.read()? as i64); - self.metadata_gauge.set(self.metadata.read()? as i64); - self.resident_gauge.set(self.resident.read()? as i64); - self.retained_gauge.set(self.retained.read()? as i64); - Ok(()) - } - - #[inline] - pub fn poll(&self) { - if let Err(error) = self._poll() { - tracing::warn!(%error, "Failed to poll jemalloc stats"); - } - } - - pub fn start(self) -> tokio::task::JoinHandle<()> { - tokio::task::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(15)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - self.poll(); - interval.tick().await; - } + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, }) } } + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! 
jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..78840f5983 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,176 +1,356 @@ -use ::metrics::{ - exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, -}; -use metrics::{ - register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter, - IntCounterPair, -}; +use std::sync::OnceLock; + +use lasso::ThreadedRodeo; +use measured::{ + label::StaticLabelSet, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, + LabelGroup, MetricGroup, +}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +pub struct Metrics { + #[metric(namespace = "proxy")] + pub proxy: ProxyMetrics, -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); + // the one metric not called proxy_.... 
+ pub semaphore_control_plane_limit: GaugeVec>, +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 6 * 2 * 2 = 72 counters - &["protocol", "cold_start_info", "outcome", "excluded"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +impl Metrics { + pub fn get() -> &'static Self { + static SELF: OnceLock = OnceLock::new(); + SELF.get_or_init(|| Metrics { + proxy: ProxyMetrics::default(), + wake_compute_lock: ApiLockMetrics::new(), + semaphore_control_plane_limit: GaugeVec::default(), + }) + } +} -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], +#[derive(MetricGroup)] +#[metric(new())] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, + + /// Time it took for proxy to establish a connection to the compute endpoint. + // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. 
+ #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_http_conn_content_length_bytes", - "Number of bytes the HTTP response content consumes", - // request/response - &["direction"], - // smallest bucket = 16 bytes - // largest bucket = 4^12 * 16 bytes = 256MB - exponential_buckets(16.0, 4.0, 12).unwrap() - ) - .unwrap() -}); + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); + /// Number of connection failures (per kind). + pub connection_failures_total: CounterVec>, -pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { - register_int_gauge!( - "proxy_http_pool_opened_connections", - "Number of opened connections to a database.", - ) - .unwrap() -}); + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, -pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_cancellation_requests_total", - "Number of cancellation requests (per found/not_found).", - &["source", "kind"], - ) - .unwrap() -}); + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). 
+ pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ProxyMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = 
MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "limit")] +pub enum RateLimit { + Actual, + Expected, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} pub enum Waiting { Cplane, @@ -185,20 +365,6 @@ struct Accumulated { compute: time::Duration, } -enum Outcome { - Success, - Failed, -} - -impl Outcome { - fn as_str(&self) -> &'static str { - match self { - Outcome::Success => 
"success", - Outcome::Failed => "failed", - } - } -} - pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, @@ -207,9 +373,9 @@ pub struct LatencyTimer { // accumulated time on the stopwatch accumulated: Accumulated, // label data - protocol: &'static str, + protocol: Protocol, cold_start_info: ColdStartInfo, - outcome: Outcome, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { @@ -219,7 +385,7 @@ pub struct LatencyTimerPause<'a> { } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, @@ -227,7 +393,7 @@ impl LatencyTimer { protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: Outcome::Failed, + outcome: ConnectOutcome::Failed, } } @@ -248,7 +414,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = Outcome::Success; + self.outcome = ConnectOutcome::Success; } } @@ -263,128 +429,54 @@ impl Drop for LatencyTimerPause<'_> { } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); - // Excluding cplane communication from the accumulated time. - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client", - ]) - .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + // Exclude client and cplane communication from the accumulated time. 
let accumulated_total = self.accumulated.client + self.accumulated.cplane; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client_and_cplane", - ]) - .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } - -pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_connecting_endpoints", - "HLL approximate cardinality of endpoints that are connecting", - &["protocol"], - ) - .unwrap() -}); - -pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_errors_total", - "Number of errors by a given classification", - &["type"], - ) - .unwrap() -}); - -pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_endpoints_affected_by_errors", - "Number of endpoints affected by errors of a given classification", - &["type"], - ) - .unwrap() -}); - -pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_redis_errors_total", - "Number of errors by a given classification", - &["channel"], - ) - .unwrap() -}); - -pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_tls_handshake_failures", - "Number of TLS handshake failures", - ) - .unwrap() -}); - -pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy> = Lazy::new(|| { - register_hll!( - 32, - "proxy_endpoints_auth_rate_limits", - "Number of endpoints affected by authentication rate limits", - ) - .unwrap() -}); - -pub static AUTH_RATE_LIMIT_HITS: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_requests_auth_rate_limits_total", - "Number of connection requests affected by authentication rate limits", - ) - .unwrap() -}); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..5598215b6b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -15,7 +15,7 @@ use crate::{ config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, error::ReportableError, - metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, + metrics::{Metrics, NumClientConnectionsGuard}, protocol2::WithClientIp, proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, @@ -24,7 +24,6 @@ use crate::{ }; use futures::TryFutureExt; use itertools::Itertools; -use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, 
StartupMessageParams}; use regex::Regex; @@ -79,9 +78,10 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["tcp"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); @@ -113,7 +113,12 @@ pub async fn task_main( }, }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); let res = handle_client( @@ -237,14 +242,17 @@ pub async fn handle_client( stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, - conn_gauge: IntCounterPairGuard, + conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!("handling interactive connection from client"); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); + // let _client_gauge = metrics.client_connections.guard(proto); + let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 4c0d68ce0b..33f394c550 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,7 +4,7 @@ use crate::{ console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::NUM_CONNECTION_FAILURES, + metrics::{ConnectionFailureKind, Metrics}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, @@ -27,10 +27,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); node_info.invalidate() } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c81a1a8292..62de79946f 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -2,11 +2,10 @@ use crate::{ cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, - metrics::NUM_BYTES_PROXIED_COUNTER, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, stream::Stream, usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; -use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -23,24 +22,25 @@ pub async fn proxy_pass( branch_id: aux.branch_id, }); - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). 
- m_sent.inc_by(cnt as u64); + metrics.get_metric(m_sent).inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); + metrics.get_metric(m_recv).inc_by(cnt as u64); }, ); @@ -60,8 +60,8 @@ pub struct ProxyPassthrough { pub compute: PostgresConnection, pub aux: MetricsAuxInfo, - pub req: IntCounterPairGuard, - pub conn: IntCounterPairGuard, + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, pub cancel: cancellation::Session

, } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index bfe4b7ec3a..f8154b1a94 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,6 +1,6 @@ use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind}; use crate::proxy::retry::retry_after; use hyper::StatusCode; use std::ops::ControlFlow; @@ -57,39 +57,46 @@ pub fn handle_try_wake( fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; - let retry = bool_to_str(retry); let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, ref text, }) if text.contains("written data quota exceeded") || text.contains("the limit for current plan reached") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::UNPROCESSABLE_ENTITY, ref text, }) if text.contains("compute time quota of non-primary branches is exceeded") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. - }) => "api_console_locked", + }) => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - }) => "api_console_bad_request", + }) => WakeupFailureKind::ApiConsoleBadRequest, WakeComputeError::ApiError(ApiError::Console { status, .. }) if status.is_server_error() => { - "api_console_other_server_error" + WakeupFailureKind::ApiConsoleOtherServerError } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", + WakeComputeError::ApiError(ApiError::Console { .. 
}) => { + WakeupFailureKind::ApiConsoleOtherError + } + WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..aba5120f38 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -17,7 +17,13 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Duration, Instant}; use tracing::info; -use crate::{intern::EndpointIdInt, EndpointId}; +use crate::{ + intern::EndpointIdInt, + { + metrics::{Metrics, RateLimit}, + EndpointId, + }, +}; use super::{ limit_algorithm::{LimitAlgorithm, Sample}, @@ -457,12 +463,9 @@ impl Limiter { } new_limit }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); + let metric = &Metrics::get().semaphore_control_plane_limit; + metric.set(RateLimit::Expected, new_limit as i64); + metric.set(RateLimit::Actual, actual_limit as i64); self.limits.store(new_limit, Ordering::Release); #[cfg(test)] if let Some(n) = &self.notifier { @@ -519,7 +522,10 @@ impl reqwest_middleware::Middleware for Limiter { extensions: &mut task_local_extensions::Extensions, next: reqwest_middleware::Next<'_>, ) -> reqwest_middleware::Result { - let start = Instant::now(); + let timer = Metrics::get() + .proxy + .control_plane_token_acquire_seconds + .start_timer(); let token = self .acquire_timeout(self.config.timeout) .await @@ -533,8 +539,12 @@ impl reqwest_middleware::Middleware for Limiter { .into(), ) })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); + let duration = timer.observe(); + info!( + ?duration, + "waiting for token to connect to the control plane" + ); + match next.run(req, extensions).await { Ok(response) => { self.release(token, Some(Outcome::from_reqwest_response(&response))) diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 8b7e3e3419..5a38530faf 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -11,7 +11,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES}, + metrics::{Metrics, RedisErrors}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -104,9 +104,9 @@ impl MessageHandler { let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[msg.get_channel_name()]) - .inc(); + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); tracing::error!("broken message: {e}"); return Ok(()); } @@ -183,7 +183,7 @@ where cache, Arc::new(CancellationHandler::<()>::new( cancel_map, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + crate::metrics::CancellationSource::FromRedis, )), region_id, ); diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index f275caa7eb..24c94fadd8 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ 
-32,7 +32,7 @@ use tokio_util::task::TaskTracker; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::metrics::{NUM_CLIENT_CONNECTION_GAUGE, TLS_HANDSHAKE_FAILURES}; +use crate::metrics::Metrics; use crate::protocol2::WithClientIp; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; @@ -156,9 +156,10 @@ async fn connection_handler( ) { let session_id = uuid::Uuid::new_v4(); - let _gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["http"]) - .guard(); + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); // handle PROXY protocol let mut conn = WithClientIp::new(conn); @@ -181,13 +182,13 @@ async fn connection_handler( } // The handshake failed Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); + Metrics::get().proxy.tls_handshake_failures.inc(); warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } // The handshake timed out Err(e) => { - TLS_HANDSHAKE_FAILURES.inc(); + Metrics::get().proxy.tls_handshake_failures.inc(); warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } @@ -274,7 +275,13 @@ async fn request_handler( // Check if the request is a websocket upgrade request. if hyper_tungstenite::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); + let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); @@ -302,7 +309,12 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { - let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 35311facb8..131f088880 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,6 +1,5 @@ use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; @@ -18,11 +17,10 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, metrics::NUM_DB_CONNECTIONS_GAUGE, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; use tracing::{debug, error, warn, Span}; @@ -78,7 +76,7 @@ pub struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, } @@ -110,7 +108,11 @@ impl EndpointConnPool { let removed 
= old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); } *total_conns -= removed; removed > 0 @@ -156,7 +158,11 @@ impl EndpointConnPool { pool.total_conns += 1; pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -176,7 +182,11 @@ impl Drop for EndpointConnPool { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); } } } @@ -215,7 +225,11 @@ impl DbUserConnPool { removed += 1; } global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } @@ -303,7 +317,10 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|endpoint, x| { @@ -331,7 +348,7 @@ impl GlobalConnPool { let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); // Do logging outside of the lock. if clients_removed > 0 { @@ -339,7 +356,11 @@ impl GlobalConnPool { .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); info!("pool: performed global pool gc. 
removed {clients_removed} clients, total number of clients in pool is {size}"); } let removed = current_len - new_len; @@ -410,7 +431,7 @@ impl GlobalConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); @@ -450,9 +471,7 @@ pub fn poll_client( conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); let mut session_id = ctx.session_id; let (tx, mut rx) = tokio::sync::watch::channel(session_id); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7f7f93988c..a66edb2c66 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -43,8 +43,8 @@ use crate::context::RequestMonitoring; use crate::error::ErrorKind; use crate::error::ReportableError; use crate::error::UserFacingError; -use crate::metrics::HTTP_CONTENT_LENGTH; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; @@ -494,10 +494,11 @@ async fn handle_inner( request: Request, backend: Arc, ) -> Result>, SqlOverHttpError> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - info!("handling interactive connection from client"); + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params @@ -520,9 +521,10 @@ async fn handle_inner( None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH - .with_label_values(&["request"]) - .observe(request_content_length as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -607,9 +609,10 @@ async fn handle_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... 
// moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); - HTTP_CONTENT_LENGTH - .with_label_values(&["response"]) - .observe(len as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); Ok(response) } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ada6c974f4..d054877126 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,7 +3,7 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, - metrics::NUM_CLIENT_CONNECTION_GAUGE, + metrics::Metrics, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; @@ -139,9 +139,10 @@ pub async fn serve_websocket( endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["ws"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); let res = handle_client( config, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index b6b7a85659..fdd2be3ee5 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -228,7 +228,7 @@ impl Stream { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), + .inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } From 40f15c31235242ffdefc8b3662ba252cec55377e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 11 Apr 2024 20:24:34 +0200 Subject: [PATCH 016/157] Read cplane events from regional redis (#7352) ## Problem Actually read redis events. ## Summary of changes This is revert of https://github.com/neondatabase/neon/pull/7350 + fixes. * Fixed events parsing * Added timeout after connection failure * Separated regional and global redis clients. 
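For reference, the new endpoint cache is configured through the `endpoint_cache_config` option, a key=value string; the value shown below is simply the default shipped in this change (`EndpointCacheConfig::CACHE_DEFAULT_OPTIONS`), with the cache left disabled:

```
initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s
```

Each entry read from the `controlPlane` Redis stream is expected to carry a JSON payload of roughly the shape exercised by the unit test in this patch:

```json
{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}
```
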
--- proxy/src/auth/backend.rs | 4 +- proxy/src/bin/proxy.rs | 61 +++-- proxy/src/cache.rs | 1 + proxy/src/cache/endpoints.rs | 226 ++++++++++++++++++ proxy/src/config.rs | 74 ++++++ proxy/src/console/provider.rs | 17 +- proxy/src/console/provider/neon.rs | 47 ++-- proxy/src/context.rs | 22 +- proxy/src/intern.rs | 15 ++ proxy/src/lib.rs | 37 +++ proxy/src/metrics.rs | 13 +- proxy/src/proxy.rs | 4 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 10 +- proxy/src/redis/cancellation_publisher.rs | 6 +- .../regress/test_proxy_rate_limiter.py | 84 ------- 16 files changed, 479 insertions(+), 144 deletions(-) create mode 100644 proxy/src/cache/endpoints.rs delete mode 100644 test_runner/regress/test_proxy_rate_limiter.py diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 229d499e30..ab5dd4544b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -27,7 +27,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -186,7 +186,7 @@ impl AuthenticationConfig { is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. // in other words, if proxy needs to run the hashing diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3392c21075..2e749fc7e8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -190,7 +190,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -301,27 +303,27 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let redis_notifications_client = - match (args.redis_notifications, (args.redis_host, args.redis_port)) { - (Some(url), _) => { - info!("Starting redis notifications listener ({url})"); - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } - (None, (Some(host), Some(port))) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), - ), + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), ), - (None, (None, None)) => { - warn!("Redis is disabled"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }; + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = 
args.http.parse()?; @@ -340,8 +342,7 @@ async fn main() -> anyhow::Result<()> { let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -416,13 +417,18 @@ async fn main() -> anyhow::Result<()> { if let Some(redis_notifications_client) = redis_notifications_client { let cache = api.caches.project_info.clone(); maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client.clone(), + redis_notifications_client, cache.clone(), cancel_map.clone(), args.region.clone(), )); maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + maintenance_tasks.spawn(async move { cache.do_read(con).await }); + } } } @@ -501,14 +507,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); let config::WakeComputeLockOptions { @@ -524,11 +534,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { permits, shards, timeout, + epoch, &Metrics::get().wake_compute_lock, ) .unwrap(), )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..f3f9e9395f --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,226 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + 
project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + ctx.set_rejected(self.should_reject(endpoint)); + return true; + } + // If the limiter allows, we don't need to check the cache. + if self.limiter.lock().await.check() { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. + if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + self.ready.store(false, Ordering::Release); + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + continue; + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) 
+ } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. + continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..b4b2ce8dbd 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -313,6 +313,80 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
+ fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index b9502f0722..3fa7221f98 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,9 +8,9 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, metrics::ApiLockMetrics, @@ -417,12 +417,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. 
+ pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -432,6 +435,7 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -442,6 +446,7 @@ pub struct ApiLocks { node_locks: DashMap>, permits: usize, timeout: Duration, + epoch: std::time::Duration, metrics: &'static ApiLockMetrics, } @@ -451,6 +456,7 @@ impl ApiLocks { permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, metrics: &'static ApiLockMetrics, ) -> prometheus::Result { Ok(Self { @@ -458,6 +464,7 @@ impl ApiLocks { node_locks: DashMap::with_shard_amount(shards), permits, timeout, + epoch, metrics, }) } @@ -495,12 +502,12 @@ impl ApiLocks { }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + pub async fn garbage_collect_worker(&self) { if self.permits == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 9ac1900324..138acdf578 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::ColdStartInfo, http, metrics::{CacheOutcome, Metrics}, - scram, + scram, Normalize, }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; @@ -24,7 +24,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, jwt: String, } @@ -56,6 +56,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -82,7 +91,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -178,23 +189,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -208,8 +223,8 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { Metrics::get() .proxy .allowed_ips_cache_misses @@ -224,16 +239,18 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 0094235921..dc475d57ed 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -12,7 +12,7 @@ use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, Metrics, Protocol}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -50,6 +50,8 @@ pub struct RequestMonitoring { // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
+ rejected: bool, } #[derive(Clone, Debug)] @@ -93,6 +95,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: false, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -113,6 +116,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = rejected; + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -176,6 +183,19 @@ impl RequestMonitoring { impl Drop for RequestMonitoring { fn drop(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: self.rejected.into(), + outcome, + }); if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..3f6d985fe8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,22 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// Endpoints are a bit tricky. Rare they might be branches or projects. 
+impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 78840f5983..b96950b0a2 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -120,6 +120,9 @@ pub struct ProxyMetrics { /// Number of endpoints affected by authentication rate limits pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, } #[derive(MetricGroup)] @@ -430,7 +433,7 @@ impl Drop for LatencyTimerPause<'_> { } #[derive(FixedCardinalityLabel, Clone, Copy, Debug)] -enum ConnectOutcome { +pub enum ConnectOutcome { Success, Failed, } @@ -480,3 +483,11 @@ impl From for Bool { } } } + +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] +pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5598215b6b..42fb10b326 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use crate::{ proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, + EndpointCacheKey, Normalize, }; use futures::TryFutureExt; use itertools::Itertools; @@ -288,7 +288,7 @@ pub async fn handle_client( // check rate limit if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { + if !endpoint_rate_limiter.check(ep.normalize(), 1) { return stream .throw_error(auth::AuthError::too_many_connections()) .await?; diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..a3b83e5e50 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index aba5120f38..7e9370f606 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -30,13 +30,13 @@ use super::{ RateLimiterConfig, }; -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -56,7 +56,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, 
RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. 
- assert httpserver.assertions == [] From e92fb94149967d5eca3eccddcdd718149d3d7031 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 11 Apr 2024 21:55:05 +0100 Subject: [PATCH 017/157] proxy: fix overloaded db connection closure (#7364) ## Problem possible for the database connections to not close in time. ## Summary of changes force the closing of connections if the client has hung up --- proxy/src/serverless/conn_pool.rs | 36 +++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 131f088880..798e488509 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -15,6 +15,7 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; @@ -488,15 +489,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = *rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -551,6 +569,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -560,10 +579,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -716,6 +743,7 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), From 94505fd67288e0301c32763348c7b75f0b63e514 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 11 Apr 2024 23:35:30 +0100 Subject: [PATCH 018/157] CI: speed up Allure reports upload (#7362) ## Problem `create-test-report` job takes more than 8 minutes, the longest step is uploading Allure report to S3: Before: ``` + aws s3 cp --recursive --only-show-errors /tmp/pr-7362-1712847045/report s3://neon-github-public-dev/reports/pr-7362/8647730612 real 6m10.572s user 6m37.717s sys 1m9.429s ``` After: ``` + s5cmd --log error cp '/tmp/pr-7362-1712858221/report/*' s3://neon-github-public-dev/reports/pr-7362/8650636861/ real 0m9.698s user 1m9.438s sys 0m6.419s ``` ## Summary of changes - 
Add `s5cmd`(https://github.com/peak/s5cmd) to build-tools image - Use `s5cmd` instead of `aws s3` for uploading Allure reports --- .github/actions/allure-report-generate/action.yml | 2 +- Dockerfile.build-tools | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 1ecb5ecc7e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 1ed6f87473..a082f15c34 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM ENV LLVM_VERSION=17 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ From e8338c60f9c048e27c38fb8212ac96b542cbfcff Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 11 Apr 2024 23:42:18 -0500 Subject: [PATCH 019/157] Fix typo in pg_ctl shutdown mode (#7365) The allowed modes as of Postgres 17 are: smart, fast, and immediate. $ cargo neon stop Finished dev [unoptimized + debuginfo] target(s) in 0.24s Running `target/debug/neon_local stop` postgres stop failed: pg_ctl failed, exit code: exit status: 1, stdout: , stderr: pg_ctl: unrecognized shutdown mode "fast " Try "pg_ctl --help" for more information. --- control_plane/src/bin/neon_local.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 56495dd2da..68a5474c87 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1231,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } From 5288f9621e2c84e912ca972e3a7bbf597884be49 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:15:40 +0100 Subject: [PATCH 020/157] build(deps): bump idna from 3.3 to 3.7 (#7367) --- poetry.lock | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7b49daf42a..aca88073a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1191,13 +1191,13 @@ files = [ [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -2182,6 +2182,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2652,6 +2653,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, From 83cdbbb89aa939a54c8388cfc4b0294831626467 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 15 Apr 2024 13:50:26 +0300 Subject: [PATCH 021/157] pageserver: improve readability of shard.rs (#7330) No functional changes, this is a comments/naming PR. While merging sharding changes, some cleanup of the shard.rs types was deferred. In this PR: - Rename `is_zero` to `is_shard_zero` to make clear that this method doesn't literally mean that the entire object is zeros, just that it refers to the 0th shard in a tenant. - Pull definitions of types to the top of shard.rs and add a big comment giving an overview of which type is for what. Closes: https://github.com/neondatabase/neon/issues/6072 --- libs/pageserver_api/src/shard.rs | 149 +++++++++++------- pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/consumption_metrics/metrics.rs | 2 +- pageserver/src/http/routes.rs | 6 +- pageserver/src/metrics.rs | 2 +- pageserver/src/tenant.rs | 4 +- .../tenant/remote_timeline_client/upload.rs | 2 +- pageserver/src/tenant/timeline.rs | 6 +- .../src/tenant/timeline/eviction_task.rs | 2 +- .../walreceiver/walreceiver_connection.rs | 2 +- pageserver/src/walingest.rs | 2 +- storage_controller/src/service.rs | 6 +- 12 files changed, 114 insertions(+), 71 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a2a9165184..c293ad705b 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -8,12 +8,89 @@ use hex::FromHex; use serde::{Deserialize, Serialize}; use utils::id::TenantId; +/// See docs/rfcs/031-sharding-static.md for an overview of sharding. +/// +/// This module contains a variety of types used to represent the concept of sharding +/// a Neon tenant across multiple physical shards. Since there are quite a few of these, +/// we provide an summary here. +/// +/// Types used to describe shards: +/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +/// which identifies a tenant which is not shard-aware. This means its storage paths do not include +/// a shard suffix. +/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +/// without the tenant ID. This is useful for things that are implicitly scoped to a particular +/// tenant, such as layer files. +/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +/// four hex digits. An unsharded tenant is `0000`. 
+/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +/// +/// Types used to describe the parameters for data distribution in a sharded tenant: +/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +/// multiple shards. Its value is given in 8kiB pages. +/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +/// always zero: this is provided for future upgrades that might introduce different +/// data distribution schemes. +/// +/// Examples: +/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +/// and their slugs are 0004, 0104, 0204, and 0304. + #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(u8); +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + impl ShardCount { pub const MAX: Self = Self(u8::MAX); @@ -38,6 +115,7 @@ impl ShardCount { self.0 } + /// pub fn is_unsharded(&self) -> bool { self.0 == 0 } @@ -53,33 +131,6 @@ impl ShardNumber { pub const MAX: Self = Self(u8::MAX); } -/// TenantShardId identify the units of work for the Pageserver. 
-/// -/// These are written as `-`, for example: -/// -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. -/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { @@ -111,10 +162,13 @@ impl TenantShardId { } /// Convenience for code that has special behavior on the 0th shard. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } @@ -150,9 +204,6 @@ impl TenantShardId { } } -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - impl<'a> std::fmt::Display for ShardSlug<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -222,16 +273,6 @@ impl From<[u8; 18]> for TenantShardId { } } -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { @@ -246,6 +287,9 @@ impl ShardIndex { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } @@ -313,6 +357,8 @@ impl Serialize for TenantShardId { if serializer.is_human_readable() { serializer.collect_str(self) } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; @@ -390,16 +436,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. 
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - pub stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -439,6 +475,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -487,6 +526,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -497,7 +538,9 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? @@ -523,7 +566,7 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f5540e896f..62bbde42f4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker( continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. 
continue; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 6740c1360b..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { - if state != TenantState::Active || !id.is_zero() { + if state != TenantState::Active || !id.is_shard_zero() { None } else { tenant_manager diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 47d8ae1148..0b8c991f11 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -696,7 +696,7 @@ async fn get_lsn_by_timestamp_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -747,7 +747,7 @@ async fn get_timestamp_of_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -1086,7 +1086,7 @@ async fn tenant_size_handler( let headers = request.headers(); let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3160f204e2..6755c15c30 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2089,7 +2089,7 @@ impl TimelineMetrics { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // Only shard zero deals in synthetic sizes - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 17ff033e00..2eac1247f7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3190,7 +3190,7 @@ impl Tenant { run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // Upload the created data dir to S3 - if self.tenant_shard_id().is_zero() { + if self.tenant_shard_id().is_shard_zero() { self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) .await?; } @@ -3437,7 +3437,7 @@ impl Tenant { .store(size, Ordering::Relaxed); // Only shard zero should be calculating synthetic sizes - debug_assert!(self.shard_identity.is_zero()); + debug_assert!(self.shard_identity.is_shard_zero()); TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 137fe48b73..0227331953 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant( let warn_after = 3; let max_attempts = 10; let mut prefixes = Vec::with_capacity(2); - if 
tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Also recover the unsharded prefix for a shard of zero: // - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is sharded, we still want to recover the initdb data, but we only diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d046a60af4..46b3d41e2b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1344,7 +1344,7 @@ impl Timeline { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - if self.tenant_shard_id.is_zero() { + if self.tenant_shard_id.is_shard_zero() { // Logical size is only maintained accurately on shard zero. self.spawn_initial_logical_size_computation_task(ctx); } @@ -2237,7 +2237,7 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Logical size is only accurately maintained on shard zero: when called elsewhere, for example // when HTTP API is serving a GET for timeline zero, return zero return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); @@ -2533,7 +2533,7 @@ impl Timeline { crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. - debug_assert!(self.tenant_shard_id.is_zero()); + debug_assert!(self.tenant_shard_id.is_shard_zero()); let guard = self .gate diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 522c5b57de..304d0d60ee 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -378,7 +378,7 @@ impl Timeline { gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // skip imitating logical size accesses for eviction purposes. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3f3419e886..c6ee6b90c4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { timeline .get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::User, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9c7e8748d5..4f83b118ae 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -403,7 +403,7 @@ impl WalIngest { ); if !key_is_local { - if self.shard.is_zero() { + if self.shard.is_shard_zero() { // Shard 0 tracks relation sizes. 
Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. self.observe_decoded_block(modification, blk, ctx).await?; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 010558b797..4ee189dac9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2744,7 +2744,7 @@ impl Service { let mut describe_shards = Vec::new(); for shard in shards { - if shard.tenant_shard_id.is_zero() { + if shard.tenant_shard_id.is_shard_zero() { shard_zero = Some(shard); } @@ -4084,7 +4084,7 @@ impl Service { let mut reconciles_spawned = 0; for (tenant_shard_id, shard) in tenants.iter_mut() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { schedule_context = ScheduleContext::default(); } @@ -4134,7 +4134,7 @@ impl Service { let mut work = Vec::new(); for (tenant_shard_id, shard) in tenants.iter() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant schedule_context = ScheduleContext::default(); tenant_shards.clear(); From f752c40f58dc854a9b0ba9a03164e8d91e95b5b3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 15 Apr 2024 16:05:44 +0300 Subject: [PATCH 022/157] storage release: stop using no-op deployProxy / deployPgSniRouter (#7382) As of https://github.com/neondatabase/aws/pull/1264 these options are no-ops. This PR unblocks removal of the variables in https://github.com/neondatabase/aws/pull/1263 --- .github/workflows/build_and_test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 36922d5294..1d35fa9223 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1133,8 +1133,6 @@ jobs: -f deployPreprodRegion=true gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ - -f deployPgSniRouter=false \ - -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ -f deployStorageController=true \ From 110282ee7ea43f1aef4164fa947382d9801e11a0 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 15 Apr 2024 20:21:50 +0200 Subject: [PATCH 023/157] proxy: Exclude private ip errors from recorded metrics (#7389) ## Problem Right now we record errors from internal VPC. ## Summary of changes * Exclude it from the metrics. 
* Simplify pg-sni-router --- proxy/src/bin/pg_sni_router.rs | 27 +++++++++++++-------------- proxy/src/context.rs | 12 +++++++++++- proxy/src/proxy.rs | 4 +++- proxy/src/proxy/copy_bidirectional.rs | 2 +- proxy/src/proxy/handshake.rs | 5 ++++- proxy/src/proxy/tests.rs | 2 +- proxy/src/proxy/tests/mitm.rs | 5 ++++- proxy/src/serverless.rs | 12 ++++++++++-- proxy/src/stream.rs | 12 ++++++++++-- 9 files changed, 57 insertions(+), 24 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 58737efe46..7a693002a8 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,15 +9,13 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; -use proxy::proxy::run_until_cancelled; -use proxy::{BranchId, EndpointId, ProjectId}; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::TryFutureExt; -use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -204,6 +202,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( + ctx: &mut RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -233,7 +232,10 @@ async fn ssl_handshake( } Ok(Stream::Tls { - tls: Box::new(raw.upgrade(tls_config).await?), + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), tls_server_end_point, }) } @@ -256,7 +258,7 @@ async fn handle_client( tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -273,18 +275,15 @@ async fn handle_client( info!("destination: {}", destination); - let client = tokio::net::TcpStream::connect(destination).await?; - - let metrics_aux: MetricsAuxInfo = MetricsAuxInfo { - endpoint_id: (&EndpointId::from("")).into(), - project_id: (&ProjectId::from("")).into(), - branch_id: (&BranchId::from("")).into(), - cold_start_info: proxy::console::messages::ColdStartInfo::Unknown, - }; + let mut client = tokio::net::TcpStream::connect(destination).await?; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); ctx.log(); - proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await + // Starting from here we only proxy the client's traffic. + info!("performing the proxy pass..."); + let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; + + Ok(()) } diff --git a/proxy/src/context.rs b/proxy/src/context.rs index dc475d57ed..d7b5be5534 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -164,8 +164,18 @@ impl RequestMonitoring { self.auth_method = Some(auth_method); } + pub fn has_private_peer_addr(&self) -> bool { + match self.peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + } + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { - Metrics::get().proxy.errors_total.inc(kind); + // Do not record errors from the private address to metrics. 
+ if !self.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } if let Some(ep) = &self.endpoint_id { let metric = &Metrics::get().proxy.endpoints_affected_by_errors; let label = metric.with_labels(kind); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 42fb10b326..f80ced91c8 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -7,6 +7,7 @@ pub mod handshake; pub mod passthrough; pub mod retry; pub mod wake_compute; +pub use copy_bidirectional::copy_bidirectional_client_compute; use crate::{ auth, @@ -256,8 +257,9 @@ pub async fn handle_client( let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls)); + let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 684be74f9a..4b09ebd8dc 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -41,7 +41,7 @@ where } #[tracing::instrument(skip_all)] -pub(super) async fn copy_bidirectional_client_compute( +pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, ) -> Result<(u64, u64), std::io::Error> diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 4665e07d23..dd935cc245 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -63,6 +63,7 @@ pub enum HandshakeData { pub async fn handshake( stream: S, mut tls: Option<&TlsConfig>, + record_handshake_error: bool, ) -> Result, HandshakeError> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); @@ -95,7 +96,9 @@ pub async fn handshake( if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; + let tls_stream = raw + .upgrade(tls.to_server_config(), record_handshake_error) + .await?; let (_, tls_server_end_point) = tls .cert_resolver diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 71d85e106d..849e9bd33c 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -175,7 +175,7 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let client = WithClientIp::new(client); - let mut stream = match handshake(client, tls.as_ref()).await? { + let mut stream = match handshake(client, tls.as_ref(), false).await? 
{ HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), }; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 3b760e5dab..cbfc9f1358 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,7 +34,10 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() { + let (end_client, startup) = match handshake(client1, Some(&server_config1), false) + .await + .unwrap() + { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), }; diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 24c94fadd8..f3c42cdb01 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -172,6 +172,10 @@ async fn connection_handler( }; let peer_addr = peer.unwrap_or(peer_addr).ip(); + let has_private_peer_addr = match peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + }; info!(?session_id, %peer_addr, "accepted new TCP connection"); // try upgrade to TLS, but with a timeout. @@ -182,13 +186,17 @@ async fn connection_handler( } // The handshake failed Ok(Err(e)) => { - Metrics::get().proxy.tls_handshake_failures.inc(); + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } // The handshake timed out Err(e) => { - Metrics::get().proxy.tls_handshake_failures.inc(); + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); return; } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index fdd2be3ee5..690e92ffb1 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -223,12 +223,20 @@ pub enum StreamUpgradeError { impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { + pub async fn upgrade( + self, + cfg: Arc, + record_handshake_error: bool, + ) -> Result, StreamUpgradeError> { match self { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?), + .inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } From 2d5a8462c8093fb7db7e15cea68c6d740818c39c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 15 Apr 2024 22:14:42 +0200 Subject: [PATCH 024/157] add `async` walredo mode (disabled-by-default, opt-in via config) (#6548) Before this PR, the `nix::poll::poll` call would stall the executor. This PR refactors the `walredo::process` module to allow for different implementations, and adds a new `async` implementation which uses `tokio::process::ChildStd{in,out}` for IPC. The `sync` variant remains the default for now; we'll do more testing in staging and gradual rollout to prod using the config variable. Performance ----------- I updated `bench_walredo.rs`, demonstrating that a single `async`-based walredo manager used by N=1...128 tokio tasks has lower latency and higher throughput. 
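In outline, the harness (the full version is in the `bench_walredo.rs` diff below) spawns N tokio tasks that share one redo manager, releases them together on a barrier, and reports the sum of per-task wall-clock time. A minimal sketch of that shape (here, `run_clients` and the placeholder redo call are illustrative, not the actual harness code):

```
use std::{sync::Arc, time::{Duration, Instant}};
use tokio::{sync::Barrier, task::JoinSet};

// `nclients` tasks start together and split the redo work evenly; the benchmark
// result is the sum of each client's wall-clock time.
async fn run_clients(nclients: u64, nredos_per_client: u64) -> Duration {
    let start = Arc::new(Barrier::new(nclients as usize));
    let mut tasks = JoinSet::new();
    for _ in 0..nclients {
        let start = Arc::clone(&start);
        tasks.spawn(async move {
            start.wait().await;
            let began = Instant::now();
            for _ in 0..nredos_per_client {
                // issue one request against the shared walredo manager here
                // (the real harness replays a canned "short" or "medium" record batch)
            }
            began.elapsed()
        });
    }
    let mut total_wallclock_time = Duration::ZERO;
    while let Some(res) = tasks.join_next().await {
        total_wallclock_time += res.unwrap();
    }
    total_wallclock_time
}
```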
I further did manual less-micro-benchmarking in the real pageserver binary. Methodology & results are published here: https://neondatabase.notion.site/2024-04-08-async-walredo-benchmarking-8c0ed3cc8d364a44937c4cb50b6d7019?pvs=4 tl;dr: - use pagebench against a pageserver patched to answer getpage request & small-enough working set to fit into PS PageCache / kernel page cache. - compare knee in the latency/throughput curve - N tenants, each 1 pagebench clients - sync better throughput at N < 30, async better at higher N - async generally noticable but not much worse p99.X tail latencies - eyeballing CPU efficiency in htop, `async` seems significantly more CPU efficient at ca N=[0.5*ncpus, 1.5*ncpus], worse than `sync` outside of that band Mental Model For Walredo & Scheduler Interactions ------------------------------------------------- Walredo is CPU-/DRAM-only work. This means that as soon as the Pageserver writes to the pipe, the walredo process becomes runnable. To the Linux kernel scheduler, the `$ncpus` executor threads and the walredo process thread are just `struct task_struct`, and it will divide CPU time fairly among them. In `sync` mode, there are always `$ncpus` runnable `struct task_struct` because the executor thread blocks while `walredo` runs, and the executor thread becomes runnable when the `walredo` process is done handling the request. In `async` mode, the executor threads remain runnable unless there are no more runnable tokio tasks, which is unlikely in a production pageserver. The above means that in `sync` mode, there is an implicit concurrency limit on concurrent walredo requests (`$num_runtimes * $num_executor_threads_per_runtime`). And executor threads do not compete in the Linux kernel scheduler for CPU time, due to the blocked-runnable-ping-pong. In `async` mode, there is no concurrency limit, and the walredo tasks compete with the executor threads for CPU time in the kernel scheduler. If we're not CPU-bound, `async` has a pipelining and hence throughput advantage over `sync` because one executor thread can continue processing requests while a walredo request is in flight. If we're CPU-bound, under a fair CPU scheduler, the *fixed* number of executor threads has to share CPU time with the aggregate of walredo processes. It's trivial to reason about this in `sync` mode due to the blocked-runnable-ping-pong. In `async` mode, at 100% CPU, the system arrives at some (potentially sub-optiomal) equilibrium where the executor threads get just enough CPU time to fill up the remaining CPU time with runnable walredo process. Why `async` mode Doesn't Limit Walredo Concurrency -------------------------------------------------- To control that equilibrium in `async` mode, one may add a tokio semaphore to limit the number of in-flight walredo requests. However, the placement of such a semaphore is non-trivial because it means that tasks queuing up behind it hold on to their request-scoped allocations. In the case of walredo, that might be the entire reconstruct data. We don't limit the number of total inflight Timeline::get (we only throttle admission). So, that queue might lead to an OOM. The alternative is to acquire the semaphore permit *before* collecting reconstruct data. However, what if we need to on-demand download? A combination of semaphores might help: one for reconstruct data, one for walredo. The reconstruct data semaphore permit is dropped after acquiring the walredo semaphore permit. 
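To make that hand-over concrete, a sketch of the ordering could look like the following; note this is purely illustrative, the PR deliberately ships without any such semaphores, and `RedoLimits`, `reconstruct_sem`, and `walredo_sem` are made-up names, not pageserver APIs:

```
use std::sync::Arc;
use tokio::sync::Semaphore;

// Hypothetical knobs; neither semaphore exists in this PR.
struct RedoLimits {
    reconstruct_sem: Arc<Semaphore>, // limits how many requests may hold reconstruct data at once
    walredo_sem: Arc<Semaphore>,     // limits concurrent walredo requests
}

async fn redo_with_limits(limits: &RedoLimits) -> anyhow::Result<()> {
    // Take a permit before gathering reconstruct data, so requests queued here
    // are not sitting on large request-scoped allocations.
    let reconstruct_permit = limits.reconstruct_sem.acquire().await?;
    // ... collect reconstruct data (possibly via on-demand downloads) ...

    // Take the walredo permit, then release the reconstruct permit: from this
    // point on we only account for the data we are actively redoing.
    let _walredo_permit = limits.walredo_sem.acquire().await?;
    drop(reconstruct_permit);

    // ... send the records to the walredo process and await the page image ...
    Ok(())
}
```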
This scheme effectively enables both a limit on in-flight reconstruct data and walredo concurrency. However, sizing the amount of permits for the semaphores is tricky: - Reconstruct data retrieval is a mix of disk IO and CPU work. - If we need to do on-demand downloads, it's network IO + disk IO + CPU work. - At this time, we have no good data on how the wall clock time is distributed. It turns out that, in my benchmarking, the system worked fine without a semaphore. So, we're shipping async walredo without one for now. Future Work ----------- We will do more testing of `async` mode and gradual rollout to prod using the config flag. Once that is done, we'll remove `sync` mode to avoid the temporary code duplication introduced by this PR. The flag will be removed. The `wait()` for the child process to exit is still synchronous; the comment [here]( https://github.com/neondatabase/neon/blob/655d3b64681b6562530665c9ab5f2f806f30ad01/pageserver/src/walredo.rs#L294-L306) is still a valid argument in favor of that. The `sync` mode had another implicit advantage: from tokio's perspective, the calling task was using up coop budget. But with `async` mode, that's no longer the case -- to tokio, the writes to the child process pipe look like IO. We could/should inform tokio about the CPU time budget consumed by the task to achieve fairness similar to `sync`. However, the [runtime function for this is `tokio_unstable`](`https://docs.rs/tokio/latest/tokio/task/fn.consume_budget.html). Refs ---- refs #6628 refs https://github.com/neondatabase/neon/issues/2975 --- libs/pageserver_api/src/models.rs | 10 +- libs/utils/src/lib.rs | 2 + libs/utils/src/poison.rs | 121 +++++ pageserver/benches/bench_walredo.rs | 147 ++++-- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 25 +- pageserver/src/metrics.rs | 23 + pageserver/src/tenant.rs | 2 +- pageserver/src/walredo.rs | 65 +-- pageserver/src/walredo/process.rs | 435 +++--------------- .../process/process_impl/process_async.rs | 374 +++++++++++++++ .../process/process_impl/process_std.rs | 405 ++++++++++++++++ test_runner/regress/test_pageserver_config.py | 35 ++ 13 files changed, 1187 insertions(+), 458 deletions(-) create mode 100644 libs/utils/src/poison.rs create mode 100644 pageserver/src/walredo/process/process_impl/process_async.rs create mode 100644 pageserver/src/walredo/process/process_impl/process_std.rs create mode 100644 test_runner/regress/test_pageserver_config.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b4909f247f..f441d1ff1a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -747,10 +747,18 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerProcessStatus { + pub pid: u32, + /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. + /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. + pub kind: Cow<'static, str>, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerStatus { pub last_redo_at: Option>, - pub pid: Option, + pub process: Option, } /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. 
initiating diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b09350d11e..2953f0aad4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -92,6 +92,8 @@ pub mod zstd; pub mod env; +pub mod poison; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs new file mode 100644 index 0000000000..0bf5664f47 --- /dev/null +++ b/libs/utils/src/poison.rs @@ -0,0 +1,121 @@ +//! Protect a piece of state from reuse after it is left in an inconsistent state. +//! +//! # Example +//! +//! ``` +//! # tokio_test::block_on(async { +//! use utils::poison::Poison; +//! use std::time::Duration; +//! +//! struct State { +//! clean: bool, +//! } +//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true })); +//! +//! let mut mutex_guard = state.lock().await; +//! let mut poison_guard = mutex_guard.check_and_arm()?; +//! let state = poison_guard.data_mut(); +//! state.clean = false; +//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail. +//! tokio::time::sleep(Duration::from_secs(10)).await; +//! state.clean = true; +//! poison_guard.disarm(); +//! # Ok::<(), utils::poison::Error>(()) +//! # }); +//! ``` + +use tracing::warn; + +pub struct Poison { + what: &'static str, + state: State, + data: T, +} + +#[derive(Clone, Copy)] +enum State { + Clean, + Armed, + Poisoned { at: chrono::DateTime }, +} + +impl Poison { + /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed. + pub fn new(what: &'static str, data: T) -> Self { + Self { + what, + state: State::Clean, + data, + } + } + + /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state. + pub fn check_and_arm(&mut self) -> Result, Error> { + match self.state { + State::Clean => { + self.state = State::Armed; + Ok(Guard(self)) + } + State::Armed => unreachable!("transient state"), + State::Poisoned { at } => Err(Error::Poisoned { + what: self.what, + at, + }), + } + } +} + +/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. +/// Once modifications are done, use [`Self::disarm`]. +/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned +/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. 
+pub struct Guard<'a, T>(&'a mut Poison); + +impl<'a, T> Guard<'a, T> { + pub fn data(&self) -> &T { + &self.0.data + } + pub fn data_mut(&mut self) -> &mut T { + &mut self.0.data + } + + pub fn disarm(self) { + match self.0.state { + State::Clean => unreachable!("we set it to Armed in check_and_arm()"), + State::Armed => { + self.0.state = State::Clean; + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +impl<'a, T> Drop for Guard<'a, T> { + fn drop(&mut self) { + match self.0.state { + State::Clean => { + // set by disarm() + } + State::Armed => { + // still armed => poison it + let at = chrono::Utc::now(); + self.0.state = State::Poisoned { at }; + warn!(at=?at, "poisoning {}", self.0.what); + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("poisoned at {at}: {what}")] + Poisoned { + what: &'static str, + at: chrono::DateTime, + }, +} diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index ffe607be4b..5b871c5d5e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,30 +27,50 @@ //! //! # Reference Numbers //! -//! 2024-04-04 on i3en.3xlarge +//! 2024-04-15 on i3en.3xlarge //! //! ```text -//! short/1 time: [25.925 µs 26.060 µs 26.209 µs] -//! short/2 time: [31.277 µs 31.483 µs 31.722 µs] -//! short/4 time: [45.496 µs 45.831 µs 46.182 µs] -//! short/8 time: [84.298 µs 84.920 µs 85.566 µs] -//! short/16 time: [185.04 µs 186.41 µs 187.88 µs] -//! short/32 time: [385.01 µs 386.77 µs 388.70 µs] -//! short/64 time: [770.24 µs 773.04 µs 776.04 µs] -//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms] -//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs] -//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs] -//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs] -//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs] -//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms] -//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms] -//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms] -//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms] +//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] +//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] +//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] +//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] +//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] +//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] +//! 
sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] +//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] +//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] +//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] +//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] +//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] +//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] +//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] +//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] +//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] //! ``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{ + config::PageServerConf, + walrecord::NeonWalRecord, + walredo::{PostgresRedoManager, ProcessKind}, +}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("short"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-short")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } - } - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("medium"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-medium")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } } } @@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
-fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { +fn bench_impl( + process_kind: ProcessKind, + redo_work: Arc, + n_redos: u64, + nclients: u64, +) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + conf.walredo_process_kind = process_kind; let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); + // divide the amount of work equally among the clients. + let nredos_per_client = n_redos / nclients; for _ in 0..nclients { rt.block_on(async { tasks.spawn(client( Arc::clone(&manager), Arc::clone(&start), Arc::clone(&redo_work), - // divide the amount of work equally among the clients - n_redos / nclients, + nredos_per_client, )) }); } - rt.block_on(async move { - let mut total_wallclock_time = std::time::Duration::from_millis(0); + let elapsed = rt.block_on(async move { + let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }) + }); + + // consistency check to ensure process kind setting worked + if nredos_per_client > 0 { + assert_eq!( + manager + .status() + .process + .map(|p| p.kind) + .expect("the benchmark work causes a walredo process to be spawned"), + std::borrow::Cow::Borrowed(process_kind.into()) + ); + } + + elapsed } async fn client( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 0903b206ff..41835f9843 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -285,6 +285,7 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); + pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1837da34ce..e10db2b853 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -97,6 +97,8 @@ pub mod defaults { pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + /// /// Default built-in configuration file. /// @@ -140,6 +142,8 @@ pub mod defaults { #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' +#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -290,6 +294,8 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. 
pub ephemeral_bytes_per_memory_kb: usize, + + pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -413,6 +419,8 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, + + walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -500,6 +508,8 @@ impl PageServerConfigBuilder { )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + + walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -683,6 +693,10 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { + self.walredo_process_kind = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -739,6 +753,7 @@ impl PageServerConfigBuilder { max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, + walredo_process_kind, } CUSTOM LOGIC { @@ -1032,6 +1047,9 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "walredo_process_kind" => { + builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1114,6 +1132,7 @@ impl PageServerConf { ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6755c15c30..be61a755ff 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1819,6 +1819,29 @@ impl Default for WalRedoProcessCounters { pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); +#[cfg(not(test))] +pub mod wal_redo { + use super::*; + + static PROCESS_KIND: Lazy> = Lazy::new(|| { + std::sync::Mutex::new( + register_uint_gauge_vec!( + "pageserver_wal_redo_process_kind", + "The configured process kind for walredo", + &["kind"], + ) + .unwrap(), + ) + }); + + pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { + // use guard to avoid races around the next two steps + let guard = 
PROCESS_KIND.lock().unwrap(); + guard.reset(); + guard.with_label_values(&[&format!("{kind}")]).set(1); + } +} + /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2eac1247f7..35ea037a55 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -386,7 +386,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => m.status(), + WalRedoManager::Prod(m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ca41a576fd..9776d4ce88 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,6 +20,7 @@ /// Process lifecycle and abstracction for the IPC protocol. mod process; +pub use process::Kind as ProcessKind; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; @@ -34,7 +35,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::key_to_rel_block; -use pageserver_api::models::WalRedoManagerStatus; +use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; use std::sync::Arc; use std::time::Duration; @@ -54,7 +55,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::WalRedoProcess`] that is used by new redo requests. + /// The current [`process::Process`] that is used by new redo requests. /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. @@ -66,7 +67,7 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. - redo_process: heavier_once_cell::OnceCell>, + redo_process: heavier_once_cell::OnceCell>, } /// @@ -139,8 +140,8 @@ impl PostgresRedoManager { } } - pub(crate) fn status(&self) -> Option { - Some(WalRedoManagerStatus { + pub fn status(&self) -> WalRedoManagerStatus { + WalRedoManagerStatus { last_redo_at: { let at = *self.last_redo_at.lock().unwrap(); at.and_then(|at| { @@ -149,8 +150,14 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) 
}) }, - pid: self.redo_process.get().map(|p| p.id()), - }) + process: self + .redo_process + .get() + .map(|p| WalRedoManagerProcessStatus { + pid: p.id(), + kind: std::borrow::Cow::Borrowed(p.kind().into()), + }), + } } } @@ -208,37 +215,33 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = - match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::Process::launch(self.conf, self.tenant_shard_id, pg_version) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc + } + }; let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres let result = proc .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) + .await .context("apply_wal_records"); let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bcbb263663..ad6b4e5fe9 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,186 +1,67 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, -}; -use anyhow::Context; +use std::time::Duration; + use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; +use utils::lsn::Lsn; + +use crate::{config::PageServerConf, walrecord::NeonWalRecord}; mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. 
- #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, +mod process_impl { + pub(super) mod process_async; + pub(super) mod process_std; } -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, +#[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +#[repr(u8)] +pub enum Kind { + Sync, + Async, } -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, +pub(crate) enum Process { + Sync(process_impl::process_std::WalRedoProcess), + Async(process_impl::process_async::WalRedoProcess), } -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( +impl Process { + #[inline(always)] + pub fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! 
{ - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. - match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), + Ok(match conf.walredo_process_kind { + Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), + Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), }) } - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) fn apply_wal_records( + #[inline(always)] + pub(crate) async fn apply_wal_records( &self, rel: RelTag, blknum: u32, @@ -188,221 +69,29 @@ impl WalRedoProcess { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. 
- let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + match self { + Process::Sync(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); + Process::Async(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. 
- let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = 
std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); } } - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} + pub(crate) fn id(&self) -> u32 { + match self { + Process::Sync(p) => p.id(), + Process::Async(p) => p.id(), + } + } -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only + pub(crate) fn kind(&self) -> Kind { + match self { + Process::Sync(_) => Kind::Sync, + Process::Async(_) => Kind::Async, + } } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs new file mode 100644 index 0000000000..262858b033 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_async.rs @@ -0,0 +1,374 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. 
+ let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. 
+ async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. 
+ // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs new file mode 100644 index 0000000000..e7a6c263c9 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_std.rs @@ -0,0 +1,405 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use nix::poll::{PollFd, 
PollFlags}; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + io::{Read, Write}, + process::{ChildStdin, ChildStdout, Command, Stdio}, + sync::{Mutex, MutexGuard}, + time::Duration, +}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, nonblock::set_nonblock}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: Mutex, + stdin: Mutex, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + macro_rules! 
set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. + match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: Mutex::new(ProcessInput { + stdin, + n_requests: 0, + }), + stdout: Mutex::new(ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + // Apply given WAL records ('records') over an old page image. Returns + // new page image. + // + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + let input = self.stdin.lock().unwrap(); + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. 
+ let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + fn apply_wal_records0( + &self, + writebuf: &[u8], + input: MutexGuard, + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. + let mut nwrite = 0usize; + + while nwrite < writebuf.len() { + let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; + let n = loop { + match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If 'stdin' is writeable, do write. + let in_revents = stdin_pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } + if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. + anyhow::bail!("WAL redo process closed its stdin unexpectedly"); + } + } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(proc); + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output = self.stdout.lock().unwrap(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. 
+ let n = loop { + match nix::poll::poll( + &mut stdout_pollfds[..], + wal_redo_timeout.as_millis() as i32, + ) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = stdout_pollfds[0].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } + if out_revents.contains(PollFlags::POLLHUP) { + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + 
fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py new file mode 100644 index 0000000000..c04348b488 --- /dev/null +++ b/test_runner/regress/test_pageserver_config.py @@ -0,0 +1,35 @@ +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + last_flush_lsn_upload, +) + + +@pytest.mark.parametrize("kind", ["sync", "async"]) +def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): + neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" + # ensure it starts + env = neon_env_builder.init_start() + # ensure the metric is set + ps_http = env.pageserver.http_client() + metrics = ps_http.get_metrics() + samples = metrics.query_all("pageserver_wal_redo_process_kind") + assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] + # ensure default tenant's config kind matches + # => write some data to force-spawn walredo + ep = env.endpoints.create_start("main") + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("create table foo(bar text)") + cur.execute("insert into foo select from generate_series(1, 100)") + last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) + ep.stop() + ep.start() + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + [(count,)] = cur.fetchall() + assert count == 100 + + status = ps_http.tenant_status(env.initial_tenant) + assert status["walredo"]["process"]["kind"] == kind From 3366cd34bacfbd2dab57378494eee0d3a21d3079 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 16 Apr 2024 11:39:18 +0300 Subject: [PATCH 025/157] pageserver: return ACCEPTED when deletion already in flight (#7384) ## Problem test_sharding_smoke recently got an added section that checks deletion of a sharded tenant. The storage controller does a retry loop for deletion, waiting for a 404 response. When deletion is a bit slow (debug builds), the retry of deletion was getting a 500 response -- this caused the test to become flaky (example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/release-proxy/8659801445/index.html#testresult/b4cbf5b58190f60e/retries) There was a false comment in the code: ``` match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. ``` If the tenant is stopping, DeleteTenantFlow does not in fact handle it, but returns a 500-yielding error. ## Summary of changes Before calling into DeleteTenantFlow, if the tenant is in stopping|broken state then return 202 if a deletion is in progress. This makes the API friendlier for retries. The historic AlreadyInProgress (409) response still exists for the case where we enter DeleteTenantFlow and unexpectedly see the tenant stopping. That should go away when we implement #5080. For the moment, callers that handle 409s should continue to do so.
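As an illustration of the retry semantics this change is aimed at (a hedged sketch only: the helper name, client type, and backoff below are made up for this example and are not the storage controller's actual code), a caller can now treat 202 as "keep polling" and 404 as "done":

```
// Sketch: poll tenant deletion until the pageserver reports the tenant gone.
// 202 (and the legacy 409) mean "deletion still in flight"; 404 means "finished".
async fn delete_until_gone(client: &reqwest::Client, url: &str) -> anyhow::Result<()> {
    loop {
        let status = client.delete(url).send().await?.status().as_u16();
        match status {
            404 => return Ok(()), // tenant fully deleted
            202 | 409 => {
                // deletion already spawned in the background: back off and retry
                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
            }
            other => anyhow::bail!("unexpected status {other} while deleting tenant"),
        }
    }
}
```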
---
 pageserver/src/tenant/delete.rs           |  5 +++++
 pageserver/src/tenant/mgr.rs              | 12 ++++++++---
 test_runner/fixtures/neon_fixtures.py     |  4 +++-
 test_runner/regress/test_tenant_delete.py | 26 ++++++++++++-----------
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index d1881f3897..33d0f677e5 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -436,6 +436,11 @@ impl DeleteTenantFlow {
         .await
     }
 
+    /// Check whether background deletion of this tenant is currently in progress
+    pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
+        tenant.delete_progress.try_lock().is_err()
+    }
+
     async fn prepare(
         tenant: &Arc,
     ) -> Result, DeleteTenantError> {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index b1b46d487b..73967f2949 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1410,9 +1410,15 @@ impl TenantManager {
 
         match tenant.current_state() {
             TenantState::Broken { .. } | TenantState::Stopping { .. } => {
-                // If a tenant is broken or stopping, DeleteTenantFlow can
-                // handle it: broken tenants proceed to delete, stopping tenants
-                // are checked for deletion already in progress.
+                // If deletion is already in progress, return success (the semantics of this
+                // function are to return success after deletion is spawned in background).
+                // Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
+                if DeleteTenantFlow::is_in_progress(&tenant) {
+                    // The `delete_progress` lock is held: deletion is already happening
+                    // in the background
+                    slot_guard.revert();
+                    return Ok(());
+                }
             }
             _ => {
                 tenant
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0e4a58c099..c2c661088b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2449,10 +2449,12 @@ class NeonPageserver(PgProtocol):
             if cur_line_no < skip_until_line_no:
                 cur_line_no += 1
                 continue
-            if contains_re.search(line):
+            elif contains_re.search(line):
                 # found it!
                 cur_line_no += 1
                 return (line, LogCursor(cur_line_no))
+            else:
+                cur_line_no += 1
         return None
 
     def tenant_attach(
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index a164c7f60a..c115c0375b 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -469,7 +469,8 @@ def test_tenant_delete_concurrent(
 ):
     """
     Validate that concurrent delete requests to the same tenant behave correctly:
-    exactly one should succeed.
+    exactly one should execute: the rest should give 202 responses but not start
+    another deletion.
 
     This is a reproducer for https://github.com/neondatabase/neon/issues/5936
     """
@@ -484,14 +485,10 @@ def test_tenant_delete_concurrent(
     run_pg_bench_small(pg_bin, endpoint.connstr())
     last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
 
-    CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken"
-
     env.pageserver.allowed_errors.extend(
         [
             # lucky race with stopping from flushing a layer we fail to schedule any uploads
             ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
-            # Errors logged from our 4xx requests
-            f".*{CONFLICT_MESSAGE}.*",
         ]
     )
 
@@ -507,7 +504,7 @@ def test_tenant_delete_concurrent(
         return ps_http.tenant_delete(tenant_id)
 
     def hit_remove_failpoint():
-        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
+        return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1]
 
     def hit_run_failpoint():
         env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
@@ -518,11 +515,14 @@ def test_tenant_delete_concurrent(
 
         # Wait until the first request completes its work and is blocked on removing
         # the TenantSlot from tenant manager.
-        wait_until(100, 0.1, hit_remove_failpoint)
+        log_cursor = wait_until(100, 0.1, hit_remove_failpoint)
+        assert log_cursor is not None
 
-        # Start another request: this should fail when it sees a tenant in Stopping state
-        with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
-            ps_http.tenant_delete(tenant_id)
+        # Start another request: this should succeed without actually entering the deletion code
+        ps_http.tenant_delete(tenant_id)
+        assert not env.pageserver.log_contains(
+            f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
+        )
 
         # Start another background request, which will pause after acquiring a TenantSlotGuard
         # but before completing.
@@ -539,8 +539,10 @@ def test_tenant_delete_concurrent(
 
         # Permit the duplicate background request to run to completion and fail.
         ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
-        with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
-            background_4xx_req.result(timeout=10)
+        background_4xx_req.result(timeout=10)
+        assert not env.pageserver.log_contains(
+            f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
+        )
 
     # Physical deletion should have happened
     assert_prefix_empty(

From 926662eb7ca12956d7210c97f28ba744b43aa30f Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 16 Apr 2024 13:41:48 +0300
Subject: [PATCH 026/157] storage_controller: suppress misleading log (#7395)

## Problem

- https://github.com/neondatabase/neon/issues/7355

The optimize_secondary function calls schedule_shard to check for improvements, but if there is exactly the same number of nodes as there are replicas of the shard, it emits some scary-looking logs about no nodes being eligible.

Closes https://github.com/neondatabase/neon/issues/7355

## Summary of changes

- Add a mode to ScheduleContext that controls logging: this should be useful in future any time we add a log to the scheduling path, to avoid it becoming a source of spam when the scheduler is called during optimization.
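
To make the intended behaviour concrete, here is a rough, self-contained sketch (simplified stand-ins for the `ScheduleMode`/`ScheduleContext` types added below, not the actual storage controller code):

```rust
// Standalone sketch: only a "sincere" scheduling attempt logs the noisy
// per-node diagnostics; speculative probes made during optimization stay quiet.
#[derive(Default, Debug)]
enum ScheduleMode {
    #[default]
    Normal,
    Speculative,
}

#[derive(Default, Debug)]
struct ScheduleContext {
    mode: ScheduleMode,
}

fn schedule_shard(context: &ScheduleContext, eligible_nodes: &[u32]) -> Option<u32> {
    if eligible_nodes.is_empty() {
        // Only a real scheduling attempt warrants the detailed failure logs.
        if !matches!(context.mode, ScheduleMode::Speculative) {
            eprintln!("Scheduling failure, dumping node states for diagnosis");
        }
        return None;
    }
    eligible_nodes.first().copied()
}

fn main() {
    // Optimization pass: probe quietly.
    let ctx = ScheduleContext { mode: ScheduleMode::Speculative };
    assert_eq!(schedule_shard(&ctx, &[]), None); // no log spam here
}
```

The real change threads the same mode check through the scheduler's failure and success logging, as shown in the diff below.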
--- storage_controller/src/scheduler.rs | 43 ++++++++++++++++++++++------- storage_controller/src/service.rs | 3 +- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 862ac0cbfe..3ff0d87988 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -84,6 +84,20 @@ impl std::ops::Add for AffinityScore { } } +/// Hint for whether this is a sincere attempt to schedule, or a speculative +/// check for where we _would_ schedule (done during optimization) +#[derive(Debug)] +pub(crate) enum ScheduleMode { + Normal, + Speculative, +} + +impl Default for ScheduleMode { + fn default() -> Self { + Self::Normal + } +} + // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling // it for many shards in the same tenant. #[derive(Debug, Default)] @@ -93,6 +107,8 @@ pub(crate) struct ScheduleContext { /// Specifically how many _attached_ locations are on each node pub(crate) attached_nodes: HashMap, + + pub(crate) mode: ScheduleMode, } impl ScheduleContext { @@ -329,27 +345,34 @@ impl Scheduler { scores.sort_by_key(|i| (i.1, i.2, i.0)); if scores.is_empty() { - // After applying constraints, no pageservers were left. We log some detail about - // the state of nodes to help understand why this happened. This is not logged as an error because - // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. - tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:"); - for (node_id, node) in &self.nodes { + // After applying constraints, no pageservers were left. + if !matches!(context.mode, ScheduleMode::Speculative) { + // If this was not a speculative attempt, log details to understand why we couldn't + // schedule: this may help an engineer understand if some nodes are marked offline + // in a way that's preventing progress. tracing::info!( - "Node {node_id}: may_schedule={} shards={}", - node.may_schedule != MaySchedule::No, - node.shard_count + "Scheduling failure, while excluding {hard_exclude:?}, node states:" ); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule != MaySchedule::No, + node.shard_count + ); + } } - return Err(ScheduleError::ImpossibleConstraint); } // Lowest score wins let node_id = scores.first().unwrap().0; - tracing::info!( + + if !matches!(context.mode, ScheduleMode::Speculative) { + tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", scores.iter().map(|i| i.0 .0).collect::>() ); + } // Note that we do not update shard count here to reflect the scheduling: that // is IntentState's job when the scheduled location is used. 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4ee189dac9..0565f8e7b4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -11,7 +11,7 @@ use crate::{ id_lock_map::IdLockMap, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::ReconcileError, - scheduler::ScheduleContext, + scheduler::{ScheduleContext, ScheduleMode}, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -4137,6 +4137,7 @@ impl Service { if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant schedule_context = ScheduleContext::default(); + schedule_context.mode = ScheduleMode::Speculative; tenant_shards.clear(); } From e5c50bb12b8013fd671052084b02626e02081c27 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 16 Apr 2024 15:16:34 +0100 Subject: [PATCH 027/157] proxy: rate limit authentication by masked IPv6. (#7316) ## Problem Many users have access to ipv6 subnets (eg a /64). That gives them 2^64 addresses to play with ## Summary of changes Truncate the address to /64 to reduce the attack surface. Todo: ~~Will NAT64 be an issue here? AFAIU they put the IPv4 address at the end of the IPv6 address. By truncating we will lose all that detail.~~ It's the same problem as a host sharing IPv6 addresses between clients. I don't think it's up to us to solve. If a customer is getting DDoSed, then they likely need to arrange a dedicated IP with us. --- proxy/src/auth/backend.rs | 112 +++++++++++++++++++++++--- proxy/src/bin/proxy.rs | 6 +- proxy/src/config.rs | 5 +- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 51 +----------- proxy/src/serverless/backend.rs | 4 +- proxy/src/serverless/sql_over_http.rs | 4 +- 7 files changed, 118 insertions(+), 66 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index ab5dd4544b..3795e3b608 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -2,8 +2,15 @@ mod classic; mod hacks; mod link; +use std::net::IpAddr; +use std::sync::Arc; +use std::time::Duration; + +use ipnet::{Ipv4Net, Ipv6Net}; pub use link::LinkAuthError; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; +use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; @@ -16,6 +23,7 @@ use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -28,9 +36,6 @@ use crate::{ stream, url, }; use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -176,11 +181,45 @@ impl TryFrom for ComputeUserInfo { } } +#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)] +pub struct MaskedIp(IpAddr); + +impl MaskedIp { + fn new(value: IpAddr, prefix: u8) -> Self { + match value { + IpAddr::V4(v4) => Self(IpAddr::V4( + Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()), + )), + IpAddr::V6(v6) => Self(IpAddr::V6( + Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()), + )), + } + } +} + +// This can't be just per IP because that would limit 
some PaaS that share IP addresses +pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; + +impl RateBucketInfo { + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; +} + impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, @@ -201,9 +240,13 @@ impl AuthenticationConfig { 1 }; - let limit_not_exceeded = self - .rate_limiter - .check((endpoint_int, ctx.peer_addr), password_weight); + let limit_not_exceeded = self.rate_limiter.check( + ( + endpoint_int, + MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + ), + password_weight, + ); if !limit_not_exceeded { warn!( @@ -271,6 +314,7 @@ async fn auth_quirks( let secret = match secret { Some(secret) => config.check_rate_limit( ctx, + config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, @@ -473,7 +517,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::{net::IpAddr, sync::Arc, time::Duration}; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -486,7 +530,7 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use crate::{ - auth::{ComputeUserInfoMaybeEndpoint, IpPattern}, + auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, config::AuthenticationConfig, console::{ self, @@ -495,12 +539,12 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + rate_limiter::RateBucketInfo, scram::ServerSecret, stream::{PqStream, Stream}, }; - use super::auth_quirks; + use super::{auth_quirks, AuthRateLimiter}; struct Auth { ips: Vec, @@ -541,6 +585,7 @@ mod tests { scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + rate_limit_ip_subnet: 64, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { @@ -552,6 +597,51 @@ mod tests { } } + #[test] + fn masked_ip() { + let ip_a = IpAddr::V4([127, 0, 0, 1].into()); + let ip_b = IpAddr::V4([127, 0, 0, 2].into()); + let ip_c = IpAddr::V4([192, 168, 1, 101].into()); + let ip_d = IpAddr::V4([192, 168, 1, 102].into()); + let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap()); + let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); + + assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); + assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); + assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); + assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); + + assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); + assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); + } + + #[test] + fn test_default_auth_rate_limit_set() { + // these values used to exceed u32::MAX + 
assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 1000 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 600 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 300 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } + #[tokio::test] async fn auth_quirks_scram() { let (mut client, server) = tokio::io::duplex(1024); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 2e749fc7e8..06ada991f3 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,6 +7,7 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::AuthRateLimiter; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; @@ -20,7 +21,6 @@ use proxy::context::parquet::ParquetUploadArgs; use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; -use proxy::rate_limiter::AuthRateLimiter; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; @@ -152,6 +152,9 @@ struct ProxyCliArgs { /// Authentication rate limiter max number of hashes per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. 
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] redis_rps_limit: Vec, @@ -575,6 +578,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index b4b2ce8dbd..7b4c02393b 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,6 @@ use crate::{ - auth, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + auth::{self, backend::AuthRateLimiter}, + rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions, }; use anyhow::{bail, ensure, Context, Ok}; @@ -58,6 +58,7 @@ pub struct AuthenticationConfig { pub scram_protocol_timeout: tokio::time::Duration, pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, + pub rate_limit_ip_subnet: u8, } impl TlsConfig { diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index a3b83e5e50..2a7297ef81 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; +pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 7e9370f606..a0a4e82fe5 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -2,7 +2,6 @@ use std::{ borrow::Cow, collections::hash_map::RandomState, hash::{BuildHasher, Hash}, - net::IpAddr, sync::{ atomic::{AtomicUsize, Ordering}, Arc, Mutex, @@ -18,11 +17,8 @@ use tokio::time::{timeout, Duration, Instant}; use tracing::info; use crate::{ - intern::EndpointIdInt, - { - metrics::{Metrics, RateLimit}, - EndpointId, - }, + metrics::{Metrics, RateLimit}, + EndpointId, }; use super::{ @@ -81,9 +77,6 @@ impl GlobalRateLimiter { // I went with a more expensive way that yields user-friendlier error messages. pub type EndpointRateLimiter = BucketRateLimiter; -// This can't be just per IP because that would limit some PaaS that share IP addresses -pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>; - pub struct BucketRateLimiter { map: DashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, @@ -155,19 +148,6 @@ impl RateBucketInfo { Self::new(100, Duration::from_secs(600)), ]; - /// All of these are per endpoint-ip pair. - /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). - /// - /// First bucket: 300mcpus total per endpoint-ip pair - /// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first) - /// * 300 requests per second with 4096 hash rounds. - /// * 2 requests per second with 600000 hash rounds. 
- pub const DEFAULT_AUTH_SET: [Self; 3] = [ - Self::new(300 * 4096, Duration::from_secs(1)), - Self::new(200 * 4096, Duration::from_secs(60)), - Self::new(100 * 4096, Duration::from_secs(600)), - ]; - pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -783,31 +763,4 @@ mod tests { } assert!(limiter.map.len() < 150_000); } - - #[test] - fn test_default_auth_set() { - // these values used to exceed u32::MAX - assert_eq!( - RateBucketInfo::DEFAULT_AUTH_SET, - [ - RateBucketInfo { - interval: Duration::from_secs(1), - max_rpi: 300 * 4096, - }, - RateBucketInfo { - interval: Duration::from_secs(60), - max_rpi: 200 * 4096 * 60, - }, - RateBucketInfo { - interval: Duration::from_secs(600), - max_rpi: 100 * 4096 * 600, - } - ] - ); - - for x in RateBucketInfo::DEFAULT_AUTH_SET { - let y = x.to_string().parse().unwrap(); - assert_eq!(x, y); - } - } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8aa5ad4e8a..e74c63599a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -6,7 +6,7 @@ use tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, - config::ProxyConfig, + config::{AuthenticationConfig, ProxyConfig}, console::{ errors::{GetAuthInfoError, WakeComputeError}, CachedNodeInfo, @@ -27,6 +27,7 @@ impl PoolingBackend { pub async fn authenticate( &self, ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); @@ -43,6 +44,7 @@ impl PoolingBackend { let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, + config, secret, &user_info.endpoint, true, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index a66edb2c66..e856053a7e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -541,7 +541,9 @@ async fn handle_inner( .map_err(SqlOverHttpError::from); let authenticate_and_connect = async { - let keys = backend.authenticate(ctx, &conn_info).await?; + let keys = backend + .authenticate(ctx, &config.authentication_config, &conn_info) + .await?; let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; From 1c012958c7b350eacf94ce631e271ef7afd2a575 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 16 Apr 2024 16:24:09 +0100 Subject: [PATCH 028/157] pageserver/http: remove status code boilerplate from swagger spec (#7385) ## Problem We specify a bunch of possible error codes in the pageserver api swagger spec. This is error prone and annoying to work with. https://github.com/neondatabase/cloud/pull/11907 introduced generic error handling on the control plane side, so we can now clean up the spec. ## Summary of changes * Remove generic error codes from swagger spec * Update a couple route handlers which would previously return an error without a `msg` field in the response body. 
Tested via https://github.com/neondatabase/cloud/pull/12340 Related https://github.com/neondatabase/cloud/issues/7238 --- pageserver/src/http/openapi_spec.yml | 615 +-------------------------- pageserver/src/http/routes.rs | 10 +- 2 files changed, 11 insertions(+), 614 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2713309824..d89f949688 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -58,24 +58,6 @@ paths: responses: "200": description: The reload completed successfully. - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error (also hits if no keys were found) - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}: parameters: @@ -93,62 +75,14 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: | Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 404 means that deletion successfully finished" responses: - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Tenant not found + description: Tenant not found. This is the success path. content: application/json: schema: @@ -165,18 +99,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: @@ -206,36 +128,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline: parameters: @@ -255,36 +147,6 @@ paths: type: array items: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}: @@ -309,60 +171,12 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Timeline not found + description: Timeline not found. This is the success path. 
content: application/json: schema: @@ -379,18 +193,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: parameters: @@ -423,36 +225,6 @@ paths: schema: type: string format: date-time - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found, or there is no timestamp information for the given lsn - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: @@ -484,36 +256,6 @@ paths: application/json: schema: $ref: "#/components/schemas/LsnByTimestampResponse" - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: @@ -537,36 +279,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -628,24 +340,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantLocationConfigResponse" - "503": - description: Tenant's state cannot be changed right now. Wait a few seconds and retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: | The tenant is already known to Pageserver in some way, @@ -662,12 +356,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -684,36 +372,6 @@ paths: responses: "200": description: Tenant ignored - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/load: @@ -740,36 +398,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: parameters: @@ -790,37 +418,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "404": - description: No tenant or timeline found for the specified ids - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/synthetic_size: parameters: @@ -839,31 +436,8 @@ paths: application/json: schema: $ref: "#/components/schemas/SyntheticSizeResponse" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + # This route has no handler. TODO: remove? /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -945,18 +519,6 @@ paths: responses: "200": description: Success - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/secondary/download: parameters: @@ -987,20 +549,6 @@ paths: application/json: schema: $ref: "#/components/schemas/SecondaryProgress" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1043,24 +591,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Malformed timeline create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "406": description: Permanently unsatisfiable request, don't retry. content: @@ -1079,18 +609,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/: get: @@ -1104,30 +622,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" post: description: | @@ -1148,43 +642,12 @@ paths: application/json: schema: type: string - "400": - description: Malformed tenant create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/config: put: @@ -1206,36 +669,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "400": - description: Malformed tenant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/config/: parameters: @@ -1255,42 +688,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" - "400": - description: Malformed get tenanant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenand or timeline were not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/utilization: get: @@ -1304,12 +701,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PageserverUtilization" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" components: securitySchemes: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0b8c991f11..20258dd950 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -457,8 +457,12 @@ async fn reload_auth_validation_keys_handler( json_response(StatusCode::OK, ()) } Err(e) => { + let err_msg = "Error reloading public keys"; warn!("Error reloading public keys from {key_path:?}: {e:}"); - json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + json_response( + StatusCode::INTERNAL_SERVER_ERROR, + HttpErrorBody::from_msg(err_msg.to_string()), + ) } } } @@ -772,7 +776,9 @@ async fn get_timestamp_of_lsn_handler( let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); json_response(StatusCode::OK, time) } - None => json_response(StatusCode::NOT_FOUND, ()), + None => Err(ApiError::NotFound( + anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + )), } } From 9e567d9814d139698dae041db849d201717ef58d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 17 Apr 2024 09:10:01 +0300 Subject: [PATCH 029/157] feat(neon_local): support listen addr for safekeeper (#7328) Leftover from my LFC benchmarks. Safekeepers only listen on `127.0.0.1` for `neon_local`. This pull request adds support for listening on other address. To specify a custom address, modify `.neon/config`. ``` [[safekeepers]] listen_addr = "192.168.?.?" ``` Endpoints created by neon_local still use 127.0.0.1 and I will fix them later. I didn't fix it in the same pull request because my benchmark setting does not use neon_local to create compute nodes so I don't know how to fix it yet -- maybe replacing a few `127.0.0.1`s. 
Signed-off-by: Alex Chi Z --- control_plane/src/local_env.rs | 2 ++ control_plane/src/safekeeper.rs | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index bd3dbef453..38b7fffd09 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -156,6 +156,7 @@ pub struct SafekeeperConf { pub remote_storage: Option, pub backup_threads: Option, pub auth_enabled: bool, + pub listen_addr: Option, } impl Default for SafekeeperConf { @@ -169,6 +170,7 @@ impl Default for SafekeeperConf { remote_storage: None, backup_threads: None, auth_enabled: false, + listen_addr: None, } } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 6ac71dfe51..d62a2e80b5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -70,24 +70,31 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: reqwest::Client, + pub listen_addr: String, pub http_base_url: String, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { + let listen_addr = if let Some(ref listen_addr) = conf.listen_addr { + listen_addr.clone() + } else { + "127.0.0.1".to_string() + }; SafekeeperNode { id: conf.id, conf: conf.clone(), - pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), + pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), http_client: reqwest::Client::new(), - http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), + http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), + listen_addr, } } /// Construct libpq connection string for connecting to this safekeeper. - fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { - PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port) + fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { + PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { @@ -111,8 +118,8 @@ impl SafekeeperNode { ); io::stdout().flush().unwrap(); - let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); - let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); + let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let id = self.id; let datadir = self.datadir_path(); @@ -139,7 +146,7 @@ impl SafekeeperNode { availability_zone, ]; if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { - let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port); + let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); } if !self.conf.sync { From cb4b40f9c1afb6fe1dbf19691845dd65b187929e Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 17 Apr 2024 09:11:04 +0300 Subject: [PATCH 030/157] chore(compute_ctl): add error context to apply_spec (#7374) Make it faster to identify which part of apply spec goes wrong by adding an error context. 
Signed-off-by: Alex Chi Z --- compute_tools/src/compute.rs | 39 ++++++++++++++++--------- compute_tools/src/spec.rs | 55 +++++++++++++++++++++++------------- 2 files changed, 60 insertions(+), 34 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 88dc4aca2b..40060f4117 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -818,9 +818,15 @@ impl ComputeNode { Client::connect(zenith_admin_connstr.as_str(), NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; + + let mut func = || { + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + Ok::<_, anyhow::Error>(()) + }; + func().context("apply_config setup cloud_admin")?; + drop(client); // reconnect with connstring with expected name @@ -832,24 +838,29 @@ impl ComputeNode { }; // Disable DDL forwarding because control plane already knows about these roles/databases. - client.simple_query("SET neon.forward_ddl = false")?; + client + .simple_query("SET neon.forward_ddl = false") + .context("apply_config SET neon.forward_ddl = false")?; // Proceed with post-startup configuration. Note, that order of operations is important. let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client)?; - cleanup_instance(&mut client)?; - handle_roles(spec, &mut client)?; - handle_databases(spec, &mut client)?; - handle_role_deletions(spec, connstr.as_str(), &mut client)?; + create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; + cleanup_instance(&mut client).context("apply_config cleanup_instance")?; + handle_roles(spec, &mut client).context("apply_config handle_roles")?; + handle_databases(spec, &mut client).context("apply_config handle_databases")?; + handle_role_deletions(spec, connstr.as_str(), &mut client) + .context("apply_config handle_role_deletions")?; handle_grants( spec, &mut client, connstr.as_str(), self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(spec, &mut client)?; - handle_extension_neon(&mut client)?; - create_availability_check_data(&mut client)?; + ) + .context("apply_config handle_grants")?; + handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; + handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; + create_availability_check_data(&mut client) + .context("apply_config create_availability_check_data")?; // 'Close' connection drop(client); @@ -857,7 +868,7 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) + handle_migrations(&mut client).context("apply_config handle_migrations") }); Ok(()) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 5643634633..269177ee16 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, 
Context, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use reqwest::StatusCode; @@ -698,7 +698,8 @@ pub fn handle_grants( // it is important to run this after all grants if enable_anon_extension { - handle_extension_anon(spec, &db.owner, &mut db_client, false)?; + handle_extension_anon(spec, &db.owner, &mut db_client, false) + .context("handle_grants handle_extension_anon")?; } } @@ -813,28 +814,36 @@ $$;"#, // Add new migrations below. ]; - let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; + let mut func = || { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + client.simple_query(query)?; - query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + client.simple_query(query)?; - query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + client.simple_query(query)?; - query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + client.simple_query(query)?; - query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + client.simple_query(query)?; + Ok::<_, anyhow::Error>(()) + }; + func().context("handle_migrations prepare")?; - query = "SELECT id FROM neon_migration.migration_id"; - let row = client.query_one(query, &[])?; + let query = "SELECT id FROM neon_migration.migration_id"; + let row = client + .query_one(query, &[]) + .context("handle_migrations get migration_id")?; let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let starting_migration_id = current_migration; - query = "BEGIN"; - client.simple_query(query)?; + let query = "BEGIN"; + client + .simple_query(query) + .context("handle_migrations begin")?; while current_migration < migrations.len() { let migration = &migrations[current_migration]; @@ -842,7 +851,9 @@ $$;"#, info!("Skip migration id={}", current_migration); } else { info!("Running migration:\n{}\n", migration); - client.simple_query(migration)?; + client.simple_query(migration).with_context(|| { + format!("handle_migrations current_migration={}", current_migration) + })?; } current_migration += 1; } @@ -850,10 +861,14 @@ $$;"#, "UPDATE neon_migration.migration_id SET id={}", migrations.len() ); - client.simple_query(&setval)?; + client + .simple_query(&setval) + .context("handle_migrations update id")?; - query = "COMMIT"; - client.simple_query(query)?; + let query = "COMMIT"; + client + .simple_query(query) + .context("handle_migrations commit")?; info!( "Ran {} migrations", From 41bb1e42b8aa6152d2f27c8f6535ce54748ef61e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 17 Apr 2024 11:50:58 +0300 Subject: [PATCH 031/157] CI(check-build-tools-image): fix getting build-tools image tag (#7402) ## Problem For PRs, by default, we check out a phantom merge commit (merge a branch into the main), but using a real branches head when finding `build-tools` image tag. 
## Summary of changes - Change `COMMIT_SHA` to use `${{ github.sha }}` instead of `${{ github.event.pull_request.head.sha }}` for PRs ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .github/workflows/check-build-tools-image.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 28646dfc19..a1e22cf93f 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -28,7 +28,9 @@ jobs: - name: Get build-tools image tag for the current commit id: get-build-tools-tag env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, + # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly. + COMMIT_SHA: ${{ github.sha }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | LAST_BUILD_TOOLS_SHA=$( From 13b9135d4eba2533d817ade229a2daf66f5f5eba Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 17 Apr 2024 11:11:49 +0200 Subject: [PATCH 032/157] proxy: Cleanup unused rate limiter (#7400) ## Problem There is an unused dead code. ## Summary of changes Let's remove it. In case we would need it in the future, we can always return it back. Also removed cli arguments. They shouldn't be used by anyone but us. --- proxy/src/bin/proxy.rs | 26 +- proxy/src/http.rs | 4 +- proxy/src/metrics.rs | 15 +- proxy/src/rate_limiter.rs | 5 - proxy/src/rate_limiter/aimd.rs | 166 --------- proxy/src/rate_limiter/limit_algorithm.rs | 98 ----- proxy/src/rate_limiter/limiter.rs | 428 +--------------------- proxy/src/usage_metrics.rs | 4 +- 8 files changed, 16 insertions(+), 730 deletions(-) delete mode 100644 proxy/src/rate_limiter/aimd.rs delete mode 100644 proxy/src/rate_limiter/limit_algorithm.rs diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 06ada991f3..cefab870cc 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -23,7 +23,6 @@ use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -132,14 +131,8 @@ struct ProxyCliArgs { #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. 
- #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, - /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. - #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] - rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm, - /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form '@'. @@ -158,11 +151,6 @@ struct ProxyCliArgs { /// Redis rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] redis_rps_limit: Vec, - /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. - #[clap(long, default_value_t = 100)] - initial_limit: usize, - #[clap(flatten)] - aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -497,13 +485,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - let rate_limiter_config = RateLimiterConfig { - disable: args.disable_dynamic_rate_limiter, - algorithm: args.rate_limit_algorithm, - timeout: args.rate_limiter_timeout, - initial_limit: args.initial_limit, - aimd_config: Some(args.aimd_config), - }; + if !args.disable_dynamic_rate_limiter { + bail!("dynamic rate limiter should be disabled"); + } let auth_backend = match &args.auth_backend { AuthBackend::Console => { @@ -545,7 +529,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); + let endpoint = http::Endpoint::new(url, http::new_client()); let api = console::provider::neon::Api::new(endpoint, caches, locks); let api = console::provider::ConsoleBackend::Console(api); diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 95ca0ccd5c..e20488e23c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -15,7 +15,6 @@ use tracing::trace; use crate::{ metrics::{ConsoleRequest, Metrics}, - rate_limiter, url::ApiUrl, }; use reqwest_middleware::RequestBuilder; @@ -23,7 +22,7 @@ use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. 
-pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware { +pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() .dns_resolver(Arc::new(GaiResolver::default())) .connection_verbose(true) @@ -32,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) - .with(rate_limiter::Limiter::new(rate_limiter_config)) .build() } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index b96950b0a2..3a4e54aea0 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,8 +4,8 @@ use lasso::ThreadedRodeo; use measured::{ label::StaticLabelSet, metric::{histogram::Thresholds, name::MetricName}, - Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, - LabelGroup, MetricGroup, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; @@ -20,9 +20,6 @@ pub struct Metrics { #[metric(namespace = "wake_compute_lock")] pub wake_compute_lock: ApiLockMetrics, - - // the one metric not called proxy_.... - pub semaphore_control_plane_limit: GaugeVec>, } impl Metrics { @@ -31,7 +28,6 @@ impl Metrics { SELF.get_or_init(|| Metrics { proxy: ProxyMetrics::default(), wake_compute_lock: ApiLockMetrics::new(), - semaphore_control_plane_limit: GaugeVec::default(), }) } } @@ -286,13 +282,6 @@ pub enum LatencyExclusions { ClientAndCplane, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "limit")] -pub enum RateLimit { - Actual, - Expected, -} - #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "kind")] pub enum SniKind { diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 2a7297ef81..c542267547 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,7 +1,2 @@ -mod aimd; -mod limit_algorithm; mod limiter; -pub use aimd::Aimd; -pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::Limiter; pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs deleted file mode 100644 index 2c14a54a6c..0000000000 --- a/proxy/src/rate_limiter/aimd.rs +++ /dev/null @@ -1,166 +0,0 @@ -use std::usize; - -use async_trait::async_trait; - -use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample}; - -use super::limiter::Outcome; - -/// Loss-based congestion avoidance. -/// -/// Additive-increase, multiplicative decrease. -/// -/// Adds available currency when: -/// 1. no load-based errors are observed, and -/// 2. the utilisation of the current limit is high. -/// -/// Reduces available concurrency by a factor when load-based errors are detected. 
-pub struct Aimd { - min_limit: usize, - max_limit: usize, - decrease_factor: f32, - increase_by: usize, - min_utilisation_threshold: f32, -} - -impl Aimd { - pub fn new(config: AimdConfig) -> Self { - Self { - min_limit: config.aimd_min_limit, - max_limit: config.aimd_max_limit, - decrease_factor: config.aimd_decrease_factor, - increase_by: config.aimd_increase_by, - min_utilisation_threshold: config.aimd_min_utilisation_threshold, - } - } -} - -#[async_trait] -impl LimitAlgorithm for Aimd { - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; - match sample.outcome { - Success => { - let utilisation = sample.in_flight as f32 / old_limit as f32; - - if utilisation > self.min_utilisation_threshold { - let limit = old_limit + self.increase_by; - limit.clamp(self.min_limit, self.max_limit) - } else { - old_limit - } - } - Overload => { - let limit = old_limit as f32 * self.decrease_factor; - - // Floor instead of round, so the limit reduces even with small numbers. - // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; - - limit.clamp(self.min_limit, self.max_limit) - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use tokio::sync::Notify; - - use super::*; - - use crate::rate_limiter::{Limiter, RateLimiterConfig}; - - #[tokio::test] - async fn should_decrease_limit_on_overload() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let release_notifier = Arc::new(Notify::new()); - - let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone()); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, Some(Outcome::Overload)).await; - release_notifier.notified().await; - assert_eq!(limiter.state().limit(), 5, "overload: decrease"); - } - - #[tokio::test] - async fn should_increase_limit_on_success_when_using_gt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - aimd_increase_by: 1, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!(limiter.state().limit(), 5, "success: increase"); - } - - #[tokio::test] - async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!( - limiter.state().limit(), - 4, - "success: ignore when < half limit" - ); - } - - #[tokio::test] - async fn should_not_change_limit_when_no_outcome() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = 
limiter.try_acquire().unwrap(); - limiter.release(token, None).await; - assert_eq!(limiter.state().limit(), 10, "ignore"); - } -} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs deleted file mode 100644 index 5cd2d5ebb7..0000000000 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ /dev/null @@ -1,98 +0,0 @@ -//! Algorithms for controlling concurrency limits. -use async_trait::async_trait; -use std::time::Duration; - -use super::{limiter::Outcome, Aimd}; - -/// An algorithm for controlling a concurrency limit. -#[async_trait] -pub trait LimitAlgorithm: Send + Sync + 'static { - /// Update the concurrency limit in response to a new job completion. - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize; -} - -/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay). -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Sample { - pub(crate) latency: Duration, - /// Jobs in flight when the sample was taken. - pub(crate) in_flight: usize, - pub(crate) outcome: Outcome, -} - -#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)] -pub enum RateLimitAlgorithm { - Fixed, - #[default] - Aimd, -} - -pub struct Fixed; - -#[async_trait] -impl LimitAlgorithm for Fixed { - async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize { - old_limit - } -} - -#[derive(Clone, Copy, Debug)] -pub struct RateLimiterConfig { - pub disable: bool, - pub algorithm: RateLimitAlgorithm, - pub timeout: Duration, - pub initial_limit: usize, - pub aimd_config: Option, -} - -impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { - match self.algorithm { - RateLimitAlgorithm::Fixed => Box::new(Fixed), - RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory. - } - } -} - -impl Default for RateLimiterConfig { - fn default() -> Self { - Self { - disable: true, - algorithm: RateLimitAlgorithm::Aimd, - timeout: Duration::from_secs(1), - initial_limit: 100, - aimd_config: Some(AimdConfig::default()), - } - } -} - -#[derive(clap::Parser, Clone, Copy, Debug)] -pub struct AimdConfig { - /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1)] - pub aimd_min_limit: usize, - /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1500)] - pub aimd_max_limit: usize, - /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 10)] - pub aimd_increase_by: usize, - /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 0.9)] - pub aimd_decrease_factor: f32, - /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`. 
- #[clap(long, default_value_t = 0.8)] - pub aimd_min_utilisation_threshold: f32, -} - -impl Default for AimdConfig { - fn default() -> Self { - Self { - aimd_min_limit: 1, - aimd_max_limit: 1500, - aimd_increase_by: 10, - aimd_decrease_factor: 0.9, - aimd_min_utilisation_threshold: 0.8, - } - } -} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index a0a4e82fe5..3796b22ae9 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -4,7 +4,7 @@ use std::{ hash::{BuildHasher, Hash}, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Mutex, }, }; @@ -12,19 +12,10 @@ use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; use rand::{rngs::StdRng, Rng, SeedableRng}; -use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Duration, Instant}; +use tokio::time::{Duration, Instant}; use tracing::info; -use crate::{ - metrics::{Metrics, RateLimit}, - EndpointId, -}; - -use super::{ - limit_algorithm::{LimitAlgorithm, Sample}, - RateLimiterConfig, -}; +use crate::EndpointId; pub struct GlobalRateLimiter { data: Vec, @@ -245,423 +236,16 @@ impl BucketRateLimiter { } } -/// Limits the number of concurrent jobs. -/// -/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the -/// token once the job is finished. -/// -/// The limit will be automatically adjusted based on observed latency (delay) and/or failures -/// caused by overload (loss). -pub struct Limiter { - limit_algo: AsyncMutex>, - semaphore: std::sync::Arc, - config: RateLimiterConfig, - - // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED - limits: AtomicUsize, - - // ONLY USE ATOMIC ADD/SUB - in_flight: Arc, - - #[cfg(test)] - notifier: Option>, -} - -/// A concurrency token, required to run a job. -/// -/// Release the token back to the [Limiter] after the job is complete. -#[derive(Debug)] -pub struct Token<'t> { - permit: Option>, - start: Instant, - in_flight: Arc, -} - -/// A snapshot of the state of the [Limiter]. -/// -/// Not guaranteed to be consistent under high concurrency. -#[derive(Debug, Clone, Copy)] -pub struct LimiterState { - limit: usize, - in_flight: usize, -} - -/// Whether a job succeeded or failed as a result of congestion/overload. -/// -/// Errors not considered to be caused by overload should be ignored. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { - /// The job succeeded, or failed in a way unrelated to overload. - Success, - /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal - /// was observed. - Overload, -} - -impl Outcome { - fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self { - match error { - reqwest_middleware::Error::Middleware(_) => Outcome::Success, - reqwest_middleware::Error::Reqwest(e) => { - if let Some(status) = e.status() { - if status.is_server_error() - || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status - { - Outcome::Overload - } else { - Outcome::Success - } - } else { - Outcome::Success - } - } - } - } - fn from_reqwest_response(response: &reqwest::Response) -> Self { - if response.status().is_server_error() - || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS - { - Outcome::Overload - } else { - Outcome::Success - } - } -} - -impl Limiter { - /// Create a limiter with a given limit control algorithm. 
- pub fn new(config: RateLimiterConfig) -> Self { - assert!(config.initial_limit > 0); - Self { - limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()), - semaphore: Arc::new(Semaphore::new(config.initial_limit)), - config, - limits: AtomicUsize::new(config.initial_limit), - in_flight: Arc::new(AtomicUsize::new(0)), - #[cfg(test)] - notifier: None, - } - } - // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self { - // assert!(initial_limit > 0); - - // Self { - // limit_algo: AsyncMutex::new(limit_algorithm), - // semaphore: Arc::new(Semaphore::new(initial_limit)), - // timeout, - // limits: AtomicUsize::new(initial_limit), - // in_flight: Arc::new(AtomicUsize::new(0)), - // #[cfg(test)] - // notifier: None, - // } - // } - - /// In some cases [Token]s are acquired asynchronously when updating the limit. - #[cfg(test)] - pub fn with_release_notifier(mut self, n: std::sync::Arc) -> Self { - self.notifier = Some(n); - self - } - - /// Try to immediately acquire a concurrency [Token]. - /// - /// Returns `None` if there are none available. - pub fn try_acquire(&self) -> Option { - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - self.semaphore - .try_acquire() - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok() - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. - pub async fn acquire_timeout(&self, duration: Duration) -> Option> { - info!("acquiring token: {:?}", self.semaphore.available_permits()); - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - match timeout(duration, self.semaphore.acquire()).await { - Ok(maybe_permit) => maybe_permit - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok(), - Err(_) => None, - } - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Return the concurrency [Token], along with the outcome of the job. - /// - /// The [Outcome] of the job, and the time taken to perform it, may be used - /// to update the concurrency limit. - /// - /// Set the outcome to `None` to ignore the job. - pub async fn release(&self, mut token: Token<'_>, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); - let in_flight = self.in_flight.load(Ordering::Acquire); - let old_limit = self.limits.load(Ordering::Acquire); - let available = if self.config.disable { - 0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0. 
- } else { - self.semaphore.available_permits() - }; - let total = in_flight + available; - - let mut algo = self.limit_algo.lock().await; - - let new_limit = if let Some(outcome) = outcome { - let sample = Sample { - latency: token.start.elapsed(), - in_flight, - outcome, - }; - algo.update(old_limit, sample).await - } else { - old_limit - }; - tracing::info!("new limit is {}", new_limit); - let actual_limit = if new_limit < total { - token.forget(); - total.saturating_sub(1) - } else { - if !self.config.disable { - self.semaphore.add_permits(new_limit.saturating_sub(total)); - } - new_limit - }; - let metric = &Metrics::get().semaphore_control_plane_limit; - metric.set(RateLimit::Expected, new_limit as i64); - metric.set(RateLimit::Actual, actual_limit as i64); - self.limits.store(new_limit, Ordering::Release); - #[cfg(test)] - if let Some(n) = &self.notifier { - n.notify_one(); - } - } - - /// The current state of the limiter. - pub fn state(&self) -> LimiterState { - let limit = self.limits.load(Ordering::Relaxed); - let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { limit, in_flight } - } -} - -impl<'t> Token<'t> { - fn new(permit: Option>, in_flight: Arc) -> Self { - Self { - permit, - start: Instant::now(), - in_flight, - } - } - - pub fn forget(&mut self) { - if let Some(permit) = self.permit.take() { - permit.forget(); - } - } -} - -impl Drop for Token<'_> { - fn drop(&mut self) { - self.in_flight.fetch_sub(1, Ordering::AcqRel); - } -} - -impl LimiterState { - /// The current concurrency limit. - pub fn limit(&self) -> usize { - self.limit - } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } -} - -#[async_trait::async_trait] -impl reqwest_middleware::Middleware for Limiter { - async fn handle( - &self, - req: reqwest::Request, - extensions: &mut task_local_extensions::Extensions, - next: reqwest_middleware::Next<'_>, - ) -> reqwest_middleware::Result { - let timer = Metrics::get() - .proxy - .control_plane_token_acquire_seconds - .start_timer(); - let token = self - .acquire_timeout(self.config.timeout) - .await - .ok_or_else(|| { - reqwest_middleware::Error::Middleware( - // TODO: Should we map it into user facing errors? 
- crate::console::errors::ApiError::Console { - status: crate::http::StatusCode::TOO_MANY_REQUESTS, - text: "Too many requests".into(), - } - .into(), - ) - })?; - let duration = timer.observe(); - info!( - ?duration, - "waiting for token to connect to the control plane" - ); - - match next.run(req, extensions).await { - Ok(response) => { - self.release(token, Some(Outcome::from_reqwest_response(&response))) - .await; - Ok(response) - } - Err(e) => { - self.release(token, Some(Outcome::from_reqwest_error(&e))) - .await; - Err(e) - } - } - } -} - #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, time::Duration}; - use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome}; - use crate::{ - rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, - EndpointId, - }; - - #[tokio::test] - async fn it_works() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 10, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - - assert_eq!(limiter.state().limit(), 10); - } - - #[tokio::test] - async fn is_fair() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - - let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token2_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - limiter.release(token1, Some(Outcome::Success)).await; - // === END TOKEN 1 === - - // === TOKEN 2 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token2" - ); - - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token2" - ); - - let token2 = token2_fut.await.unwrap(); - - limiter.release(token2, Some(Outcome::Success)).await; - // === END TOKEN 2 === - - // === TOKEN 3 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token3" - ); - - let token3 = token3_fut.await.unwrap(); - limiter.release(token3, Some(Outcome::Success)).await; - // === END TOKEN 3 === - - // === TOKEN 4 === - let token4 = limiter.try_acquire().unwrap(); - limiter.release(token4, Some(Outcome::Success)).await; - } - - #[tokio::test] - async fn disable() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: true, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - let token2 = limiter.try_acquire().unwrap(); - let state = limiter.state(); - assert_eq!(state.limit(), 1); - assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected. 
- limiter.release(token1, None).await; - limiter.release(token2, None).await; - } + use super::{BucketRateLimiter, EndpointRateLimiter}; + use crate::{rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 5ffbf95c07..56ed2145dc 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -495,7 +495,7 @@ mod tests { use url::Url; use super::*; - use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId}; + use crate::{http, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -525,7 +525,7 @@ mod tests { tokio::spawn(server); let metrics = Metrics::default(); - let client = http::new_client(RateLimiterConfig::default()); + let client = http::new_client(); let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); From e49e931bc44c0ebe52a90db865b64c87f3281c92 Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Wed, 17 Apr 2024 11:23:55 +0200 Subject: [PATCH 033/157] Add for `add-help-for-timeline-arg` for `timeline` command (#7361) ## Problem When calling `./neon_local timeline` a confusing error message pops up: `command failed: no tenant subcommand provided` ## Summary of changes Add `add-help-for-timeline-arg` for timeline commands so when no argument for the timeline is provided help is printed. --- control_plane/src/bin/neon_local.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 68a5474c87..7f8f6d21e0 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1417,6 +1417,7 @@ fn cli() -> Command { .subcommand( Command::new("timeline") .about("Manage timelines") + .arg_required_else_help(true) .subcommand(Command::new("list") .about("List all timelines, available to this pageserver") .arg(tenant_id_arg.clone())) From 3023de156e35db166d8d24a4d298f36f558593eb Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 17 Apr 2024 11:32:07 +0100 Subject: [PATCH 034/157] pageserver: demote range end fallback log (#7403) ## Problem This trace is emitted whenever a vectored read touches the end of a delta layer file. It's a perfectly normal case, but I expected it to be more rare when implementing the code. ## Summary of changes Demote log to debug. --- pageserver/src/tenant/storage_layer/delta_layer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 466d95f46d..255855a246 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -939,7 +939,7 @@ impl DeltaLayerInner { } if !range_end_handled { - tracing::info!("Handling range end fallback at {}", data_end_offset); + tracing::debug!("Handling range end fallback at {}", data_end_offset); planner.handle_range_end(data_end_offset); } } From fd49005cb3016da98e6f0f6305549a601e7ebc7b Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 17 Apr 2024 13:33:31 +0200 Subject: [PATCH 035/157] proxy: Improve logging (#7405) ## Problem It's unclear from logs what's going on with the regional redis. ## Summary of changes Make logs better. 
--- proxy/src/bin/proxy.rs | 4 +++- proxy/src/cache/endpoints.rs | 9 ++++++++- proxy/src/context.rs | 17 +++++++++++++++-- .../connection_with_credentials_provider.rs | 16 ++++++++++++++-- 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index cefab870cc..71283dd606 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -42,6 +42,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use tracing::Instrument; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -418,7 +419,8 @@ async fn main() -> anyhow::Result<()> { if let Some(regional_redis_client) = regional_redis_client { let cache = api.caches.endpoints_cache.clone(); let con = regional_redis_client; - maintenance_tasks.spawn(async move { cache.do_read(con).await }); + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span)); } } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index f3f9e9395f..72543c6408 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -13,6 +13,7 @@ use redis::{ }; use serde::Deserialize; use tokio::sync::Mutex; +use tracing::info; use crate::{ config::EndpointCacheConfig, @@ -71,7 +72,9 @@ impl EndpointsCache { } // If cache is disabled, just collect the metrics and return. if self.config.disable_cache { - ctx.set_rejected(self.should_reject(endpoint)); + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); return true; } // If the limiter allows, we don't need to check the cache. 
@@ -79,6 +82,7 @@ impl EndpointsCache { return true; } let rejected = self.should_reject(endpoint); + info!(?rejected, "check endpoint is valid, enabled cache"); ctx.set_rejected(rejected); !rejected } @@ -171,6 +175,9 @@ impl EndpointsCache { if res.keys.is_empty() { if return_when_finish { + if total != 0 { + break; + } anyhow::bail!( "Redis stream {} is empty, cannot be used to filter endpoints", self.config.stream_name diff --git a/proxy/src/context.rs b/proxy/src/context.rs index d7b5be5534..95c74e6cca 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -5,7 +5,7 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info_span, Span}; +use tracing::{field::display, info, info_span, Span}; use uuid::Uuid; use crate::{ @@ -198,12 +198,25 @@ impl Drop for RequestMonitoring { } else { ConnectOutcome::Failed }; + let rejected = self.rejected; + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?ep, + ?outcome, + ?rejected, + "check endpoint is valid with outcome" + ); Metrics::get() .proxy .invalid_endpoints_total .inc(InvalidEndpointsGroup { protocol: self.protocol, - rejected: self.rejected.into(), + rejected: rejected.into(), outcome, }); if let Some(tx) = self.sender.take() { diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index d183abb53a..3a90d911c2 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -77,10 +77,14 @@ impl ConnectionWithCredentialsProvider { } } + async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { + redis::cmd("PING").query_async(con).await + } + pub async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { - match redis::cmd("PING").query_async(con).await { + match Self::ping(con).await { Ok(()) => { return Ok(()); } @@ -96,7 +100,7 @@ impl ConnectionWithCredentialsProvider { if let Some(f) = self.refresh_token_task.take() { f.abort() } - let con = self + let mut con = self .get_client() .await? .get_multiplexed_tokio_connection() @@ -109,6 +113,14 @@ impl ConnectionWithCredentialsProvider { }); self.refresh_token_task = Some(f); } + match Self::ping(&mut con).await { + Ok(()) => { + info!("Connection succesfully established"); + } + Err(e) => { + error!("Connection is broken. Error during PING: {e:?}"); + } + } self.con = Some(con); Ok(()) } From d5708e74357ca19146098770895356326542306e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:16:11 +0200 Subject: [PATCH 036/157] proxy: Record role to span (#7407) ## Problem ## Summary of changes Add dbrole to span. 
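For context, a minimal sketch of the `tracing` pattern used here: the span pre-declares an empty `role` field which is filled in once the value is known. The function and names below are illustrative, not code from this repo:

```rust
use tracing::{field, info_span};

fn connect(role_name: &str) {
    // Declare the field up front so it can be recorded later on the same span.
    let span = info_span!("connection", role = field::Empty);
    // ... once the client has told us which role it connects as:
    span.record("role", field::display(role_name));
}
```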
--- proxy/src/context.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 95c74e6cca..8cd3024fcf 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -76,6 +76,7 @@ impl RequestMonitoring { ?session_id, %peer_addr, ep = tracing::field::Empty, + role = tracing::field::Empty, ); Self { @@ -157,6 +158,7 @@ impl RequestMonitoring { } pub fn set_user(&mut self, user: RoleName) { + self.span.record("role", display(&user)); self.user = Some(user); } From a54ea8fb1cd26396a06d2fd715bcf19b8b7a7226 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 18 Apr 2024 06:00:33 +0100 Subject: [PATCH 037/157] proxy: move endpoint rate limiter (#7413) ## Problem ## Summary of changes Rate limit for wake_compute calls --- proxy/src/bin/proxy.rs | 12 +++++------- proxy/src/config.rs | 1 - proxy/src/console/provider.rs | 6 ++++++ proxy/src/console/provider/neon.rs | 12 ++++++++++++ proxy/src/proxy.rs | 16 +--------------- proxy/src/proxy/wake_compute.rs | 1 + proxy/src/rate_limiter/limiter.rs | 26 +++++++++++--------------- proxy/src/serverless.rs | 18 +++--------------- proxy/src/serverless/websocket.rs | 3 --- 9 files changed, 39 insertions(+), 56 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 71283dd606..b54f8c131c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -331,7 +331,6 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); let redis_publisher = match ®ional_redis_client { @@ -357,7 +356,6 @@ async fn main() -> anyhow::Result<()> { config, proxy_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), )); @@ -372,7 +370,6 @@ async fn main() -> anyhow::Result<()> { config, serverless_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), )); } @@ -533,7 +530,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let url = args.auth_endpoint.parse()?; let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches, locks); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + let api = + console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } @@ -567,8 +568,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; let mut redis_rps_limit = args.redis_rps_limit.clone(); RateBucketInfo::validate(&mut redis_rps_limit)?; @@ -581,7 +580,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, - endpoint_rps_limit, redis_rps_limit, handshake_timeout: args.handshake_timeout, region: args.region.clone(), diff --git a/proxy/src/config.rs 
b/proxy/src/config.rs index 7b4c02393b..f9519c7645 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -29,7 +29,6 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, - pub endpoint_rps_limit: Vec, pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 3fa7221f98..aa1800a9da 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -208,6 +208,9 @@ pub mod errors { #[error(transparent)] ApiError(ApiError), + #[error("Too many connections attempts")] + TooManyConnections, + #[error("Timeout waiting to acquire wake compute lock")] TimeoutError, } @@ -240,6 +243,8 @@ pub mod errors { // However, API might return a meaningful error. ApiError(e) => e.to_string_client(), + TooManyConnections => self.to_string(), + TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), } } @@ -250,6 +255,7 @@ pub mod errors { match self { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), + WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 138acdf578..58b2a1570c 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -12,6 +12,7 @@ use crate::{ console::messages::ColdStartInfo, http, metrics::{CacheOutcome, Metrics}, + rate_limiter::EndpointRateLimiter, scram, Normalize, }; use crate::{cache::Cached, context::RequestMonitoring}; @@ -25,6 +26,7 @@ pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub locks: &'static ApiLocks, + pub endpoint_rate_limiter: Arc, jwt: String, } @@ -34,6 +36,7 @@ impl Api { endpoint: http::Endpoint, caches: &'static ApiCaches, locks: &'static ApiLocks, + endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -43,6 +46,7 @@ impl Api { endpoint, caches, locks, + endpoint_rate_limiter, jwt, } } @@ -277,6 +281,14 @@ impl super::Api for Api { return Ok(cached); } + // check rate limit + if !self + .endpoint_rate_limiter + .check(user_info.endpoint.normalize().into(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + let permit = self.locks.get_wake_compute_permit(&key).await?; // after getting back a permit - it's possible the cache was filled diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index f80ced91c8..4321bad968 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -19,9 +19,8 @@ use crate::{ metrics::{Metrics, NumClientConnectionsGuard}, protocol2::WithClientIp, proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - EndpointCacheKey, Normalize, + EndpointCacheKey, }; use futures::TryFutureExt; use itertools::Itertools; @@ -61,7 +60,6 @@ pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - endpoint_rate_limiter: Arc, cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ @@ -86,7 +84,6 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); @@ -128,7 +125,6 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, - endpoint_rate_limiter, conn_gauge, ) .instrument(span.clone()) @@ -242,7 +238,6 @@ pub async fn handle_client( cancellation_handler: Arc, stream: S, mode: ClientMode, - endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { info!( @@ -288,15 +283,6 @@ pub async fn handle_client( Err(e) => stream.throw_error(e).await?, }; - // check rate limit - if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep.normalize(), 1) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await?; - } - } - let user = user_info.get_user().to_owned(); let user_info = match user_info .authenticate( diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index f8154b1a94..fe228ab33d 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -90,6 +90,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { WakeComputeError::ApiError(ApiError::Console { .. }) => { WakeupFailureKind::ApiConsoleOtherError } + WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; Metrics::get() diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 3796b22ae9..5ba2c36436 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -15,7 +15,7 @@ use rand::{rngs::StdRng, Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; -use crate::EndpointId; +use crate::intern::EndpointIdInt; pub struct GlobalRateLimiter { data: Vec, @@ -61,12 +61,7 @@ impl GlobalRateLimiter { // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. -// -// We also may save quite a lot of CPU (I think) by bailing out right after we -// saw SNI, before doing TLS handshake. User-side error messages in that case -// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now -// I went with a more expensive way that yields user-friendlier error messages. 
-pub type EndpointRateLimiter = BucketRateLimiter; +pub type EndpointRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: DashMap, Hasher>, @@ -245,7 +240,7 @@ mod tests { use tokio::time; use super::{BucketRateLimiter, EndpointRateLimiter}; - use crate::{rate_limiter::RateBucketInfo, EndpointId}; + use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { @@ -295,39 +290,40 @@ mod tests { let limiter = EndpointRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); + let endpoint = EndpointIdInt::from(endpoint); time::pause(); for _ in 0..100 { - assert!(limiter.check(endpoint.clone(), 1)); + assert!(limiter.check(endpoint, 1)); } // more connections fail - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // fail even after 500ms as it's in the same bucket time::advance(time::Duration::from_millis(500)).await; - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // after a full 1s, 100 requests are allowed again time::advance(time::Duration::from_millis(500)).await; for _ in 1..6 { for _ in 0..50 { - assert!(limiter.check(endpoint.clone(), 2)); + assert!(limiter.check(endpoint, 2)); } time::advance(time::Duration::from_millis(1000)).await; } // more connections after 600 will exceed the 20rps@30s limit - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // will still fail before the 30 second limit time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // after the full 30 seconds, 100 requests are allowed again time::advance(time::Duration::from_millis(1)).await; for _ in 0..100 { - assert!(limiter.check(endpoint.clone(), 1)); + assert!(limiter.check(endpoint, 1)); } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index f3c42cdb01..b0f4026c76 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -35,7 +35,6 @@ use crate::context::RequestMonitoring; use crate::metrics::Metrics; use crate::protocol2::WithClientIp; use crate::proxy::run_until_cancelled; -use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; @@ -53,7 +52,6 @@ pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, - endpoint_rate_limiter: Arc, cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -117,7 +115,6 @@ pub async fn task_main( backend.clone(), connections.clone(), cancellation_handler.clone(), - endpoint_rate_limiter.clone(), cancellation_token.clone(), server.clone(), tls_acceptor.clone(), @@ -147,7 +144,6 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellation_handler: Arc, - endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, server: Builder, tls_acceptor: TlsAcceptor, @@ -231,7 +227,6 @@ async fn connection_handler( cancellation_handler.clone(), session_id, peer_addr, - endpoint_rate_limiter.clone(), http_request_token, ) .in_current_span() @@ -270,7 +265,6 @@ async fn request_handler( cancellation_handler: Arc, session_id: uuid::Uuid, peer_addr: IpAddr, - endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. 
not used to cancel websockets http_cancellation_token: CancellationToken, ) -> Result>, ApiError> { @@ -298,15 +292,9 @@ async fn request_handler( ws_connections.spawn( async move { - if let Err(e) = websocket::serve_websocket( - config, - ctx, - websocket, - cancellation_handler, - host, - endpoint_rate_limiter, - ) - .await + if let Err(e) = + websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host) + .await { error!("error in websocket connection: {e:#}"); } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index d054877126..eddd278b7d 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -5,7 +5,6 @@ use crate::{ error::{io_error, ReportableError}, metrics::Metrics, proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -136,7 +135,6 @@ pub async fn serve_websocket( websocket: HyperWebsocket, cancellation_handler: Arc, hostname: Option, - endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; let conn_gauge = Metrics::get() @@ -150,7 +148,6 @@ pub async fn serve_websocket( cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, - endpoint_rate_limiter, conn_gauge, ) .await; From 5191f6ef0e381887981d40e4f8001ff63c9abc8e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 18 Apr 2024 07:09:12 +0200 Subject: [PATCH 038/157] proxy: Record only valid rejected events (#7415) ## Problem Sometimes rejected metric might record invalid events. ## Summary of changes * Only record it `rejected` was explicitly set. * Change order in logs. * Report metrics if not under high-load. --- proxy/src/cache/endpoints.rs | 18 +++++-------- proxy/src/context.rs | 49 ++++++++++++++++++------------------ 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 72543c6408..2aa1986d5e 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -70,20 +70,14 @@ impl EndpointsCache { if !self.ready.load(Ordering::Acquire) { return true; } - // If cache is disabled, just collect the metrics and return. - if self.config.disable_cache { - let rejected = self.should_reject(endpoint); - ctx.set_rejected(rejected); - info!(?rejected, "check endpoint is valid, disabled cache"); - return true; - } - // If the limiter allows, we don't need to check the cache. - if self.limiter.lock().await.check() { - return true; - } let rejected = self.should_reject(endpoint); - info!(?rejected, "check endpoint is valid, enabled cache"); ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); + // If cache is disabled, just collect the metrics and return or + // If the limiter allows, we don't need to check the cache. + if self.config.disable_cache || self.limiter.lock().await.check() { + return true; + } !rejected } fn should_reject(&self, endpoint: &EndpointId) -> bool { diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 8cd3024fcf..17b82c08aa 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -51,7 +51,7 @@ pub struct RequestMonitoring { sender: Option>, pub latency_timer: LatencyTimer, // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
- rejected: bool, + rejected: Option, } #[derive(Clone, Debug)] @@ -96,7 +96,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - rejected: false, + rejected: None, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -118,7 +118,7 @@ impl RequestMonitoring { } pub fn set_rejected(&mut self, rejected: bool) { - self.rejected = rejected; + self.rejected = Some(rejected); } pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { @@ -200,27 +200,28 @@ impl Drop for RequestMonitoring { } else { ConnectOutcome::Failed }; - let rejected = self.rejected; - let ep = self - .endpoint_id - .as_ref() - .map(|x| x.as_str()) - .unwrap_or_default(); - // This makes sense only if cache is disabled - info!( - ?ep, - ?outcome, - ?rejected, - "check endpoint is valid with outcome" - ); - Metrics::get() - .proxy - .invalid_endpoints_total - .inc(InvalidEndpointsGroup { - protocol: self.protocol, - rejected: rejected.into(), - outcome, - }); + if let Some(rejected) = self.rejected { + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?outcome, + ?rejected, + ?ep, + "check endpoint is valid with outcome" + ); + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: rejected.into(), + outcome, + }); + } if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } From 8d0f7017678b1c54f415da9de212d2749e6af9b2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 18 Apr 2024 10:43:04 +0300 Subject: [PATCH 039/157] feat: copy delta layer prefix or "truncate" (#7228) For "timeline ancestor merge" or "timeline detach," we need to "cut" delta layers at particular LSN. The name "truncate" is not used as it would imply that a layer file changes, instead of what happens: we copy keys with Lsn less than a "cut point". Cc: #6994 Add the "copy delta layer prefix" operation to DeltaLayerInner, re-using some of the vectored read internals. The code is `cfg(test)` until it will be used later with a more complete integration test. --- pageserver/src/repository.rs | 138 ++++++ .../src/tenant/storage_layer/delta_layer.rs | 445 +++++++++++++++++- pageserver/src/tenant/storage_layer/layer.rs | 42 +- pageserver/src/tenant/vectored_blob_io.rs | 25 +- pageserver/src/walrecord.rs | 1 + 5 files changed, 632 insertions(+), 19 deletions(-) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 9959d105eb..0a9ac50aad 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -33,6 +33,52 @@ impl Value { } } +#[cfg(test)] +#[derive(Debug, PartialEq)] +pub(crate) enum InvalidInput { + TooShortValue, + TooShortPostgresRecord, +} + +/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets +/// use this type for querying if a slice looks some particular way. +#[cfg(test)] +pub(crate) struct ValueBytes; + +#[cfg(test)] +impl ValueBytes { + pub(crate) fn will_init(raw: &[u8]) -> Result { + if raw.len() < 12 { + return Err(InvalidInput::TooShortValue); + } + + let value_discriminator = &raw[0..4]; + + if value_discriminator == [0, 0, 0, 0] { + // Value::Image always initializes + return Ok(true); + } + + if value_discriminator != [0, 0, 0, 1] { + // not a Value::WalRecord(..) 
+ return Ok(false); + } + + let walrecord_discriminator = &raw[4..8]; + + if walrecord_discriminator != [0, 0, 0, 0] { + // only NeonWalRecord::Postgres can have will_init + return Ok(false); + } + + if raw.len() < 17 { + return Err(InvalidInput::TooShortPostgresRecord); + } + + Ok(raw[8] == 1) + } +} + #[cfg(test)] mod test { use super::*; @@ -70,6 +116,8 @@ mod test { ]; roundtrip!(image, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] @@ -93,6 +141,96 @@ mod test { ]; roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + } + + #[test] + fn bytes_inspection_too_short_image() { + let rec = Value::Image(Bytes::from_static(b"")); + + #[rustfmt::skip] + let expected = [ + // top level discriminator of 4 bytes + 0x00, 0x00, 0x00, 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 12); + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ); + } + } + + #[test] + fn bytes_inspection_too_short_postgres_record() { + let rec = NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b""), + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // flattened discriminator of total 8 bytes + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + // will_init + 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 17); + for len in 12..17 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortPostgresRecord + ) + } + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ) + } + } + + #[test] + fn clear_visibility_map_flags_example() { + let rec = NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: Some(0x11), + old_heap_blkno: None, + flags: 0x03, + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // discriminators + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x01, + // Some == 1 followed by 4 bytes + 0x01, 0x00, 0x00, 0x00, 0x11, + // None == 0 + 0x00, + // flags + 0x03 + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 255855a246..c5b5e5c98f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -20,8 +20,8 @@ //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! ``` //! -//! Every delta file consists of three parts: "summary", "index", and -//! "values". The summary is a fixed size header at the beginning of the file, +//! Every delta file consists of three parts: "summary", "values", and +//! "index". The summary is a fixed size header at the beginning of the file, //! and it contains basic information about the layer, and offsets to the other //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! "values" part. 
The actual page images and WAL records are stored in the @@ -863,7 +863,7 @@ impl DeltaLayerInner { .into(), ); - let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let data_end_offset = self.index_start_offset(); let reads = Self::plan_reads( keyspace, @@ -1103,11 +1103,195 @@ impl DeltaLayerInner { if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of value storage, // which corresponds to beginning of the index - last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size; + last.size = self.index_start_offset() - last.size; } Ok(all_keys) } + /// Using the given writer, write out a truncated version, where LSNs higher than the + /// truncate_at are missing. + #[cfg(test)] + pub(super) async fn copy_prefix( + &self, + writer: &mut DeltaLayerWriter, + truncate_at: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use crate::tenant::vectored_blob_io::{ + BlobMeta, VectoredReadBuilder, VectoredReadExtended, + }; + use futures::stream::TryStreamExt; + + #[derive(Debug)] + enum Item { + Actual(Key, Lsn, BlobRef), + Sentinel, + } + + impl From for Option<(Key, Lsn, BlobRef)> { + fn from(value: Item) -> Self { + match value { + Item::Actual(key, lsn, blob) => Some((key, lsn, blob)), + Item::Sentinel => None, + } + } + } + + impl Item { + fn offset(&self) -> Option { + match self { + Item::Actual(_, _, blob) => Some(*blob), + Item::Sentinel => None, + } + } + + fn is_last(&self) -> bool { + matches!(self, Item::Sentinel) + } + } + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); + // put in a sentinel value for getting the end offset for last item, and not having to + // repeat the whole read part + let stream = stream.chain(futures::stream::once(futures::future::ready(Ok( + Item::Sentinel, + )))); + let mut stream = std::pin::pin!(stream); + + let mut prev: Option<(Key, Lsn, BlobRef)> = None; + + let mut read_builder: Option = None; + + let max_read_size = self + .max_vectored_read_bytes + .map(|x| x.0.get()) + .unwrap_or(8192); + + let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + + // FIXME: buffering of DeltaLayerWriter + let mut per_blob_copy = Vec::new(); + + while let Some(item) = stream.try_next().await? 
{ + tracing::debug!(?item, "popped"); + let offset = item + .offset() + .unwrap_or(BlobRef::new(self.index_start_offset(), false)); + + let actionable = if let Some((key, lsn, start_offset)) = prev.take() { + let end_offset = offset; + + Some((BlobMeta { key, lsn }, start_offset..end_offset)) + } else { + None + }; + + let is_last = item.is_last(); + + prev = Option::from(item); + + let actionable = actionable.filter(|x| x.0.lsn < truncate_at); + + let builder = if let Some((meta, offsets)) = actionable { + // extend or create a new builder + if read_builder + .as_mut() + .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta)) + .unwrap_or(VectoredReadExtended::No) + == VectoredReadExtended::Yes + { + None + } else { + read_builder.replace(VectoredReadBuilder::new( + offsets.start.pos(), + offsets.end.pos(), + meta, + max_read_size, + )) + } + } else { + // nothing to do, except perhaps flush any existing for the last element + None + }; + + // flush the possible older builder and also the new one if the item was the last one + let builders = builder.into_iter(); + let builders = if is_last { + builders.chain(read_builder.take()) + } else { + builders.chain(None) + }; + + for builder in builders { + let read = builder.build(); + + let reader = VectoredBlobReader::new(&self.file); + + let mut buf = buffer.take().unwrap(); + + buf.clear(); + buf.reserve(read.size()); + let res = reader.read_blobs(&read, buf).await?; + + for blob in res.blobs { + let key = blob.meta.key; + let lsn = blob.meta.lsn; + let data = &res.buf[blob.start..blob.end]; + + #[cfg(debug_assertions)] + Value::des(data) + .with_context(|| { + format!( + "blob failed to deserialize for {}@{}, {}..{}: {:?}", + blob.meta.key, + blob.meta.lsn, + blob.start, + blob.end, + utils::Hex(data) + ) + }) + .unwrap(); + + // is it an image or will_init walrecord? + // FIXME: this could be handled by threading the BlobRef to the + // VectoredReadBuilder + let will_init = crate::repository::ValueBytes::will_init(data) + .inspect_err(|_e| { + #[cfg(feature = "testing")] + tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value"); + }) + .unwrap_or(false); + + per_blob_copy.clear(); + per_blob_copy.extend_from_slice(data); + + let (tmp, res) = writer + .put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init) + .await; + per_blob_copy = tmp; + res?; + } + + buffer = Some(res.buf); + } + } + + assert!( + read_builder.is_none(), + "with the sentinel above loop should had handled all" + ); + + Ok(()) + } + pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { println!( "index_start_blk: {}, root {}", @@ -1177,6 +1361,44 @@ impl DeltaLayerInner { Ok(()) } + + #[cfg(test)] + fn stream_index_forwards<'a, R>( + &'a self, + reader: &'a DiskBtreeReader, + start: &'a [u8; DELTA_KEY_SIZE], + ctx: &'a RequestContext, + ) -> impl futures::stream::Stream< + Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, + > + 'a + where + R: BlockReader, + { + use futures::stream::TryStreamExt; + let stream = reader.get_stream_from(start, ctx); + stream.map_ok(|(key, value)| { + let key = DeltaKey::from_slice(&key); + let (key, lsn) = (key.key(), key.lsn()); + let offset = BlobRef(value); + + (key, lsn, offset) + }) + } + + /// The file offset to the first block of index. + /// + /// The file structure is summary, values, and index. We often need this for the size of last blob. 
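+    ///
+    /// For example, with `index_start_blk == 3` the index begins at byte offset `3 * PAGE_SZ`,
+    /// i.e. three full pages past the start of the file.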
+ fn index_start_offset(&self) -> u64 { + let offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let bref = BlobRef(offset); + tracing::debug!( + index_start_blk = self.index_start_blk, + offset, + pos = bref.pos(), + "index_start_offset" + ); + offset + } } /// A set of data associated with a delta layer key and its value @@ -1538,7 +1760,7 @@ mod test { let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; - let inner = resident.get_inner_delta(&ctx).await?; + let inner = resident.as_delta(&ctx).await?; let file_size = inner.file.metadata().await?.len(); tracing::info!( @@ -1594,4 +1816,217 @@ mod test { Ok(()) } + + #[tokio::test] + async fn copy_delta_prefix_smoke() { + use crate::walrecord::NeonWalRecord; + use bytes::Bytes; + + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let (tenant, ctx) = h.load().await; + let ctx = &ctx; + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .await + .unwrap(); + + let initdb_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .next() + .unwrap(); + + { + let mut writer = timeline.writer().await; + + let data = [ + (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))), + ( + 0x30, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b"1"), + }), + ), + ( + 0x40, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"2"), + }), + ), + // build an oversized value so we cannot extend and existing read over + // this + ( + 0x50, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: { + let mut buf = + vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024]; + buf.iter_mut() + .enumerate() + .for_each(|(i, slot)| *slot = (i % 256) as u8); + Bytes::from(buf) + }, + }), + ), + // because the oversized read cannot be extended further, we are sure to exercise the + // builder created on the last round with this: + ( + 0x60, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"3"), + }), + ), + ( + 0x60, + 9, + Value::Image(Bytes::from_static(b"something for a different key")), + ), + ]; + + let mut last_lsn = None; + + for (lsn, key, value) in data { + let key = Key::from_i128(key); + writer.put(key, Lsn(lsn), &value, ctx).await.unwrap(); + last_lsn = Some(lsn); + } + + writer.finish_write(Lsn(last_lsn.unwrap())); + } + timeline.freeze_and_flush().await.unwrap(); + + let new_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .find(|x| x != &initdb_layer) + .unwrap(); + + // create a copy for the timeline, so we don't overwrite the file + let branch = tenant + .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx) + .await + .unwrap(); + + assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60)); + + // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just + // a single key + + for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] { + let truncate_at = Lsn(truncate_at); + + let mut writer = DeltaLayerWriter::new( + tenant.conf, + branch.timeline_id, + tenant.tenant_shard_id, + Key::MIN, + Lsn(0x11)..truncate_at, + ) + .await + .unwrap(); + + let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + + new_layer + .copy_delta_prefix(&mut writer, truncate_at, ctx) + .await + .unwrap(); + + let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap(); + + 
copied_layer.as_delta(ctx).await.unwrap(); + + assert_keys_and_values_eq( + new_layer.as_delta(ctx).await.unwrap(), + copied_layer.as_delta(ctx).await.unwrap(), + truncate_at, + ctx, + ) + .await; + } + } + + async fn assert_keys_and_values_eq( + source: &DeltaLayerInner, + truncated: &DeltaLayerInner, + truncated_at: Lsn, + ctx: &RequestContext, + ) { + use futures::future::ready; + use futures::stream::TryStreamExt; + + let start_key = [0u8; DELTA_KEY_SIZE]; + + let source_reader = FileBlockReader::new(&source.file, source.file_id); + let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + source.index_start_blk, + source.index_root_blk, + &source_reader, + ); + let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx); + let source_stream = source_stream.filter(|res| match res { + Ok((_, lsn, _)) => ready(lsn < &truncated_at), + _ => ready(true), + }); + let mut source_stream = std::pin::pin!(source_stream); + + let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id); + let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + truncated.index_start_blk, + truncated.index_root_blk, + &truncated_reader, + ); + let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx); + let mut truncated_stream = std::pin::pin!(truncated_stream); + + let mut scratch_left = Vec::new(); + let mut scratch_right = Vec::new(); + + loop { + let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next()); + let (src, truncated) = tokio::try_join!(src, truncated).unwrap(); + + if src.is_none() { + assert!(truncated.is_none()); + break; + } + + let (src, truncated) = (src.unwrap(), truncated.unwrap()); + + // because we've filtered the source with Lsn, we should always have the same keys from both. + assert_eq!(src.0, truncated.0); + assert_eq!(src.1, truncated.1); + + // if this is needed for something else, just drop this assert. + assert!( + src.2.pos() >= truncated.2.pos(), + "value position should not go backwards {} vs. {}", + src.2.pos(), + truncated.2.pos() + ); + + scratch_left.clear(); + let src_cursor = source_reader.block_cursor(); + let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx); + scratch_right.clear(); + let trunc_cursor = truncated_reader.block_cursor(); + let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx); + + tokio::try_join!(left, right).unwrap(); + + assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); + } + } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 27e60f783c..291da0f645 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -116,6 +116,12 @@ impl AsLayerDesc for Layer { } } +impl PartialEq for Layer { + fn eq(&self, other: &Self) -> bool { + Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0) + } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. pub(crate) fn for_evicted( @@ -1752,6 +1758,28 @@ impl ResidentLayer { } } + /// FIXME: truncate is bad name because we are not truncating anything, but copying the + /// filtered parts. + #[cfg(test)] + pub(super) async fn copy_delta_prefix( + &self, + writer: &mut super::delta_layer::DeltaLayerWriter, + truncate_at: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use LayerKind::*; + + let owner = &self.owner.0; + + match self.downloaded.get(owner, ctx).await? 
{ + Delta(ref d) => d + .copy_prefix(writer, truncate_at, ctx) + .await + .with_context(|| format!("truncate {self}")), + Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")), + } + } + pub(crate) fn local_path(&self) -> &Utf8Path { &self.owner.0.path } @@ -1761,14 +1789,14 @@ impl ResidentLayer { } #[cfg(test)] - pub(crate) async fn get_inner_delta<'a>( - &'a self, + pub(crate) async fn as_delta( + &self, ctx: &RequestContext, - ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> { - let owner = &self.owner.0; - match self.downloaded.get(owner, ctx).await? { - LayerKind::Delta(d) => Ok(d), - LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")), + ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { + use LayerKind::*; + match self.downloaded.get(&self.owner.0, ctx).await? { + Delta(ref d) => Ok(d), + Image(_) => Err(anyhow::anyhow!("image layer")), } } } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 3a6950cf88..91934d5e0e 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -61,18 +61,18 @@ pub struct VectoredRead { } impl VectoredRead { - pub fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } } #[derive(Eq, PartialEq)] -enum VectoredReadExtended { +pub(crate) enum VectoredReadExtended { Yes, No, } -struct VectoredReadBuilder { +pub(crate) struct VectoredReadBuilder { start: u64, end: u64, blobs_at: VecMap, @@ -80,7 +80,17 @@ struct VectoredReadBuilder { } impl VectoredReadBuilder { - fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) @@ -97,7 +107,8 @@ impl VectoredReadBuilder { /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size - fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; if self.end == start && self.size() + size <= self.max_read_size { self.end = end; @@ -111,11 +122,11 @@ impl VectoredReadBuilder { VectoredReadExtended::No } - fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } - fn build(self) -> VectoredRead { + pub(crate) fn build(self) -> VectoredRead { VectoredRead { start: self.start, end: self.end, diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ae2d996879..02f6f49694 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -55,6 +55,7 @@ impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? 
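+    ///
+    /// Only `NeonWalRecord::Postgres` carries an explicit `will_init` flag; the other record
+    /// kinds (e.g. `ClearVisibilityMapFlags`) are always applied on top of an existing page image.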
pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init match self { NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, From 637ad4a6380000ad5af17726deccea6bc963efab Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Apr 2024 13:16:03 +0100 Subject: [PATCH 040/157] pageserver: fix secondary download scheduling (#7396) ## Problem Some tenants were observed to stop doing downloads after some time ## Summary of changes - Fix a rogue `<` that was incorrectly scheduling work when `now` was _before_ the scheduling target, rather than after. This usually resulted in too-frequent execution, but could also result in never executing, if the current time has advanced ahead of `next_download` at the time we call `schedule()`. - Fix in-memory list of timelines not being amended after timeline deletion: the resulted in repeated harmless logs about the timeline being removed, and redundant calls to remove_dir_all for the timeline path. - Add a log at startup to make it easier to see a particular tenant starting in secondary mode (this is for parity with the logging that exists when spawning an attached tenant). Previously searching on tenant ID didn't provide a clear signal as to how the tenant was started during pageserver start. - Add a test that exercises secondary downloads using the background scheduling, whereas existing tests were using the API hook to invoke download directly. --- pageserver/src/metrics.rs | 6 +- pageserver/src/tenant/mgr.rs | 19 ++-- pageserver/src/tenant/secondary/downloader.rs | 11 ++- .../regress/test_pageserver_secondary.py | 86 +++++++++++++++++++ 4 files changed, 112 insertions(+), 10 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index be61a755ff..e6db95082b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1518,7 +1518,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } -pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { + SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" @@ -1536,7 +1537,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", - "Number of downloads of heatmaps by secondary mode locations" + "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: register_int_counter!( @@ -1544,6 +1545,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), +} }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 73967f2949..2c9476ba0a 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -678,12 +678,19 @@ pub async fn init_tenant_mgr( } } } - LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - shard_identity, - location_conf.tenant_conf, - &secondary_conf, - )), + LocationMode::Secondary(secondary_conf) => { + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = 
%tenant_shard_id.shard_slug(), + "Starting secondary tenant" + ); + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )) + } }; tenants.insert(tenant_shard_id, slot); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5b29c126d1..67f866cb7b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -312,7 +312,7 @@ impl JobGenerator next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, @@ -647,6 +647,12 @@ impl<'a> TenantDownloader<'a> { progress.bytes_downloaded += layer_byte_count; progress.layers_downloaded += layer_count; } + + for delete_timeline in &delete_timelines { + // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal + // from disk fails that will be a fatal error. + detail.timelines.remove(delete_timeline); + } } // Execute accumulated deletions @@ -710,13 +716,14 @@ impl<'a> TenantDownloader<'a> { .await .map_err(UpdateError::from)?; + SECONDARY_MODE.download_heatmap.inc(); + if Some(&download.etag) == prev_etag { Ok(HeatMapDownload::Unmodified) } else { let mut heatmap_bytes = Vec::new(); let mut body = tokio_util::io::StreamReader::new(download.download_stream); let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - SECONDARY_MODE.download_heatmap.inc(); Ok(HeatMapDownload::Modified(HeatMapModified { etag: download.etag, last_modified: download.last_modified, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 345abdc072..8f194e5dda 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,6 +1,7 @@ import json import os import random +import time from pathlib import Path from typing import Any, Dict, Optional @@ -582,6 +583,91 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) +def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): + """ + Slow test that runs in realtime, checks that the background scheduling of secondary + downloads happens as expected. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Create this many tenants, each with two timelines + tenant_count = 4 + tenant_timelines = {} + + # This mirrors a constant in `downloader.rs` + freshen_interval_secs = 60 + + for _i in range(0, tenant_count): + tenant_id = TenantId.generate() + timeline_a = TimelineId.generate() + timeline_b = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_a, + placement_policy='{"Attached":1}', + # Run with a low heatmap period so that we can avoid having to do synthetic API calls + # to trigger the upload promptly. + conf={"heatmap_period": "1s"}, + ) + env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + + tenant_timelines[tenant_id] = [timeline_a, timeline_b] + + t_start = time.time() + + # Wait long enough that the background downloads should happen; we expect all the inital layers + # of all the initial timelines to show up on the secondary location of each tenant. 
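+    # 1.5x the freshen interval gives the scheduler some slack: long enough to be confident that
+    # at least one download pass has run, without waiting for many extra passes.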
+ time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + for timeline_id in timelines: + log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + # One or more layers should be present for all timelines + assert list_layers(ps_secondary, tenant_id, timeline_id) + + # Delete the second timeline: this should be reflected later on the secondary + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) + + # Wait long enough for the secondary locations to see the deletion + time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # This one was not deleted + assert list_layers(ps_secondary, tenant_id, timelines[0]) + + # This one was deleted + assert not list_layers(ps_secondary, tenant_id, timelines[1]) + + t_end = time.time() + + # Measure how many heatmap downloads we did in total: this checks that we succeeded with + # proper scheduling, and not some bug that just runs downloads in a loop. + total_heatmap_downloads = 0 + for ps in env.pageservers: + v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total") + assert v is not None + total_heatmap_downloads += int(v) + + download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) + + expect_download_rate = 1.0 / freshen_interval_secs + log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") + + assert download_rate < expect_download_rate * 2 + + @pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): From 0d8e68003a9ef5bb628a245a66b915322824dd44 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 18 Apr 2024 14:45:25 +0100 Subject: [PATCH 041/157] Add a docs page for storage controller (#7392) ## Problem External contributors need information on how to use the storage controller. ## Summary of changes - Background content on what the storage controller is. - Deployment information on how to use it. This is not super-detailed, but should be enough for a well motivated third party to get started, with an occasional peek at the code. --- docs/storage_controller.md | 150 +++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 docs/storage_controller.md diff --git a/docs/storage_controller.md b/docs/storage_controller.md new file mode 100644 index 0000000000..4cb796edaa --- /dev/null +++ b/docs/storage_controller.md @@ -0,0 +1,150 @@ +# Storage Controller + +## Concepts + +The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller, +which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations). 
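+
+For example (an illustrative sketch only: `$JWT` stands for a token with the `pageserverapi` scope described
+below, `$TENANT_CREATE_BODY` for a pageserver-style tenant creation request body, and `127.0.0.1:1234` is
+where `neon_local` runs the controller), a single call such as
+
+```
+curl -X POST http://127.0.0.1:1234/v1/tenant \
+  -H "Authorization: Bearer $JWT" \
+  -H "Content-Type: application/json" \
+  -d "$TENANT_CREATE_BODY"
+```
+
+is fanned out by the controller into per-shard configuration calls against every pageserver involved.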
+ +It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding +the underlying details of how data is spread across multiple nodes. + +The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent. + +## APIs + +The storage controller’s HTTP server implements four logically separate APIs: + +- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver. +- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits. +- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system. +- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers + to ensure data safety with generation numbers. + +The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs). + +See the `http.rs` file in the source for where the HTTP APIs are implemented. + +## Database + +The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not +persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and +rebuilt on startup. + +The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. + +The `diesel` crate is used for defining models & migrations. + +Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database. + +### Diesel tip: migrations + +If you need to modify the database schema, here’s how to create a migration: + +- Install the diesel CLI with `cargo install diesel_cli` +- Use `diesel migration generate <name>` to create a new migration +- Populate the SQL files in the `migrations/` subdirectory +- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` +- Commit the migration files and the changes to schema.rs +- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. +- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. + +## storcon_cli + +The `storcon_cli` tool enables interactive management of the storage controller. This is usually +only necessary for debugging, but may also be used to manage nodes (e.g. marking a node as offline). + +`storcon_cli --help` includes details on commands. + +# Deploying + +This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as +part of a self-hosted system. + +_General note: since the default `neon_local` environment includes a storage controller, this is a useful +reference when figuring out deployment._ + +## Database + +It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral +local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver. + +The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte. + +Set the URL to the database using the `--database-url` CLI option. + +There is no need to run migrations manually: the storage controller automatically applies migrations +when it starts up. + +## Configure pageservers to use the storage controller + +1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should + point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters. +2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself + with the storage controller when it starts up. See the example below for the format of this file. + +### Example `metadata.json` + +``` +{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000} +``` + +- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever + postgres runs. +- `http_port` and `http_host` refer to the pageserver's HTTP API; this must be accessible from where + the storage controller runs. + +## Handle compute notifications. + +The storage controller independently moves tenant attachments between pageservers in response to +changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable +postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver +location changes. + +The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires +JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. + +In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane.
In `neon_local` systems +the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling +the compute hook. + +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: +the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience. + +``` +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option<ShardStripeSize>, + shards: Vec<ComputeHookNotifyRequestShard>, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The + shards identified by `NodeId` must be converted to the address+port of the node. + - if stripe_size is not None, set `neon.stripe_size` to this value + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds. + +### Example notification body + +``` +{ + "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", + "stripe_size": 32768, + "shards": [ + {"node_id": 344, "shard_number": 0}, + {"node_id": 722, "shard_number": 1}, + ], +} +``` From 3df67bf4d7d23a074cd0e45104e86ebc36315242 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 18 Apr 2024 18:27:58 +0300 Subject: [PATCH 042/157] fix(Layer): metric regression with too many canceled evictions (#7363) #7030 introduced an annoying papercut, deeming a failure to acquire a strong reference to `LayerInner` from `DownloadedLayer::drop` as a canceled eviction. Most of the time, it wasn't that, but just timeline deletion or tenant detach with the layer not wanting to be deleted or evicted. When a Layer is dropped as part of a normal shutdown, the `Layer` is dropped first, and the `DownloadedLayer` the second. Because of this, we cannot detect eviction being canceled from the `DownloadedLayer::drop`. We can detect it from `LayerInner::drop`, which this PR adds. Test case is added which before had 1 started eviction, 2 canceled. Now it accurately finds 1 started, 1 canceled. --- libs/utils/src/sync/heavier_once_cell.rs | 51 +++++++++- pageserver/src/tenant/storage_layer/layer.rs | 16 ++- .../src/tenant/storage_layer/layer/tests.rs | 97 +++++++++++++++++++ 3 files changed, 155 insertions(+), 9 deletions(-) diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 8eee1f72a6..1abd3d9861 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -192,6 +192,14 @@ impl<T> OnceCell<T> { } } + /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never + /// initialized. + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let inner = self.inner.get_mut().unwrap(); + + inner.take_and_deinit() + } + /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. pub fn initializer_count(&self) -> usize { self.initializers.load(Ordering::Relaxed) @@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> { /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete.
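+    ///
+    /// A usage sketch: after `let (value, permit) = guard.take_and_deinit();` the cell reads as
+    /// uninitialized, and a later `get_or_init` will block until the returned `InitPermit` is
+    /// dropped or consumed by a new initialization.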
pub fn take_and_deinit(mut self) -> (T, InitPermit) { + self.0 + .take_and_deinit() + .expect("guard is not created unless value has been initialized") + } +} + +impl Inner { + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let value = self.value.take()?; + let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit sem.try_acquire().expect("we just created this").forget(); - std::mem::swap(&mut *self.0, &mut swapped); - swapped - .value - .map(|v| (v, InitPermit(sem))) - .expect("guard is not created unless value has been initialized") + let permit = InitPermit(sem); + std::mem::swap(self, &mut swapped); + Some((value, permit)) } } @@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> { /// On drop, this type will return the permit. pub struct InitPermit(Arc); +impl std::fmt::Debug for InitPermit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ptr = Arc::as_ptr(&self.0) as *const (); + f.debug_tuple("InitPermit").field(&ptr).finish() + } +} + impl Drop for InitPermit { fn drop(&mut self) { assert_eq!( @@ -559,4 +582,22 @@ mod tests { assert_eq!(*target.get().unwrap(), 11); } + + #[tokio::test] + async fn take_and_deinit_on_mut() { + use std::convert::Infallible; + + let mut target = OnceCell::::default(); + assert!(target.take_and_deinit().is_none()); + + target + .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) }) + .await + .unwrap(); + + let again = target.take_and_deinit(); + assert!(matches!(again, Some((42, _))), "{again:?}"); + + assert!(target.take_and_deinit().is_none()); + } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 291da0f645..e55299f0fa 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -610,9 +610,17 @@ enum Status { impl Drop for LayerInner { fn drop(&mut self) { + // if there was a pending eviction, mark it cancelled here to balance metrics + if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit() + { + // eviction has already been started + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + + // eviction request is intentionally not honored as no one is present to wait for it + // and we could be delaying shutdown for nothing. + } + if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? seems more like a hazard - // than a clear win. return; } @@ -1558,8 +1566,8 @@ impl Drop for DownloadedLayer { if let Some(owner) = self.owner.upgrade() { owner.on_downloaded_layer_drop(self.version); } else { - // no need to do anything, we are shutting down - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + // Layer::drop will handle cancelling the eviction; because of drop order and + // `DownloadedLayer` never leaking, we cannot know here if eviction was requested. 
} } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 247ff123b5..f0697fdf28 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -721,6 +721,103 @@ async fn evict_and_wait_does_not_wait_for_download() { layer.evict_and_wait(FOREVER).await.unwrap(); } +/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident, +/// which is the last value. +/// +/// Also checks that the same does not happen on a non-evicted layer (regression test). +#[tokio::test(start_paused = true)] +async fn eviction_cancellation_on_drop() { + use crate::repository::Value; + use bytes::Bytes; + + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + { + // create_test_timeline wrote us one layer, write another + let mut writer = timeline.writer().await; + writer + .put( + Key::from_i128(5), + Lsn(0x20), + &Value::Image(Bytes::from_static(b"this does not matter either")), + &ctx, + ) + .await + .unwrap(); + + writer.finish_write(Lsn(0x20)); + } + + timeline.freeze_and_flush().await.unwrap(); + + // wait for the upload to complete so our Arc::strong_count assertion holds + timeline + .remote_client + .as_ref() + .unwrap() + .wait_completion() + .await + .unwrap(); + + let (evicted_layer, not_evicted) = { + let mut layers = { + let mut guard = timeline.layers.write().await; + let layers = guard.likely_resident_layers().collect::>(); + // remove the layers from layermap + guard.finish_gc_timeline(&layers); + + layers + }; + + assert_eq!(layers.len(), 2); + + (layers.pop().unwrap(), layers.pop().unwrap()) + }; + + let victims = [(evicted_layer, true), (not_evicted, false)]; + + for (victim, evict) in victims { + let resident = victim.keep_resident().await.unwrap(); + drop(victim); + + assert_eq!(Arc::strong_count(&resident.owner.0), 1); + + if evict { + let evict_and_wait = resident.owner.evict_and_wait(FOREVER); + + // drive the future to await on the status channel, and then drop it + tokio::time::timeout(ADVANCE, evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + } + + // 1 == we only evict one of the layers + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + drop(resident); + + // run any spawned + tokio::time::sleep(ADVANCE).await; + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get() + ); + } +} + #[test] fn layer_size() { assert_eq!(std::mem::size_of::(), 2040); From 681a04d2874514a2fae4fd0a11114ecb48c42280 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:47:34 +0000 Subject: [PATCH 043/157] build(deps): bump aiohttp from 3.9.2 to 3.9.4 (#7429) --- poetry.lock | 156 ++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 79 insertions(+), 79 deletions(-) diff --git a/poetry.lock b/poetry.lock index aca88073a8..6ed64d28fc 100644 --- a/poetry.lock +++ 
b/poetry.lock @@ -2,87 +2,87 @@ [[package]] name = "aiohttp" -version = "3.9.2" +version = "3.9.4" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"}, - {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"}, - {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"}, - {file = 
"aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"}, - {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"}, - {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"}, - {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"}, - {file = 
"aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"}, - {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"}, - {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"}, - {file = 
"aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"}, - {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"}, - {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"}, - {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, + {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, + {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = 
"sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, + {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, + {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, + {file = 
"aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, + {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, + {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, + {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, + {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, + {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, + {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, + {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, ] [package.dependencies] @@ -2900,4 +2900,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41" +content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572" diff --git a/pyproject.toml b/pyproject.toml index 156f135062..aadcf26818 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.2" +aiohttp = "3.9.4" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" From 6eb946e2ded051d2d8f6b2c545d67288212e6dab Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 18 Apr 2024 18:40:30 +0100 Subject: [PATCH 044/157] pageserver: fix cont lsn jump on vectored read path (#7412) ## Problem Vectored read path may return an image that's newer than the request lsn under certain circumstances. 
``` LSN ^ | | 500 | ------------------------- -> branch point 400 | X 300 | X 200 | ------------------------------------> requested lsn 100 | X |---------------------------------> Key Legend: * X - page images ``` The vectored read path inspects each ancestor timeline one by one starting from the current one. When moving into the ancestor timeline, the current code resets the current search lsn (called `cont_lsn` in code) to the lsn of the ancestor timeline ([here](https://github.com/neondatabase/neon/blob/d5708e74357ca19146098770895356326542306e/pageserver/src/tenant/timeline.rs#L2971)). For instance, if the request lsn was 200, we would: 1. Look into the current timeline and find nothing for the key 2. Descend into the ancestor timeline and set `cont_lsn=500` 3. Return the page image at LSN 400 Christian and I find it very unlikely that this has happened in prod, since the vectored read path is always used at the last record lsn. This issue was found by a regress test during the work to migrate get page handling to use the vectored implementation. I've applied my fix to that wip branch and it fixed the issue. ## Summary of changes The fix is to set the current search lsn to the minimum of the requested LSN and the ancestor LSN. Hence, at step 2 above we would set the current search lsn to 200 and ignore the images above that. A test illustrating the bug is also included. Fails without the patch and passes with it. --- pageserver/src/tenant.rs | 164 +++++++++++++++++++++++++++++- pageserver/src/tenant/timeline.rs | 3 +- 2 files changed, 165 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 35ea037a55..ff17400d45 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3848,6 +3848,8 @@ pub(crate) mod harness { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; @@ -3858,7 +3860,7 @@ mod tests { use hex_literal::hex; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; - use tests::timeline::ShutdownMode; + use tests::timeline::{GetVectoredError, ShutdownMode}; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4794,6 +4796,166 @@ mod tests { Ok(()) } + // Test that vectored get descends into ancestor timelines correctly and + // does not return an image that's newer than requested. + // + // The diagram below illustrates an interesting case. We have a parent timeline + // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed + // from the child timeline, so the parent timeline must be visited. When advancing into + // the child timeline, the read path needs to remember what the requested Lsn was in + // order to avoid returning an image that's too new. The test below constructs such + // a timeline setup and does a few queries around the Lsn of each page image. 
+ // ``` + // LSN + // ^ + // | + // | + // 500 | --------------------------------------> branch point + // 400 | X + // 300 | X + // 200 | --------------------------------------> requested lsn + // 100 | X + // |---------------------------------------> Key + // | + // ------> requested key + // + // Legend: + // * X - page images + // ``` + #[tokio::test] + async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let (tenant, ctx) = harness.load().await; + + let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let end_key = start_key.add(1000); + let child_gap_at_key = start_key.add(500); + let mut parent_gap_lsns: BTreeMap = BTreeMap::new(); + + let mut current_lsn = Lsn(0x10); + + let timeline_id = TimelineId::generate(); + let parent_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + for _ in 0..3 { + let mut key = start_key; + while key < end_key { + current_lsn += 0x10; + + let image_value = format!("{} at {}", child_gap_at_key, current_lsn); + + let mut writer = parent_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&image_value)), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + if key == child_gap_at_key { + parent_gap_lsns.insert(current_lsn, image_value); + } + + key = key.next(); + } + + parent_timeline.freeze_and_flush().await?; + } + + let child_timeline_id = TimelineId::generate(); + + let child_timeline = tenant + .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx) + .await?; + + let mut key = start_key; + while key < end_key { + if key == child_gap_at_key { + key = key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + key = key.next(); + } + + child_timeline.freeze_and_flush().await?; + + let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10]; + let mut query_lsns = Vec::new(); + for image_lsn in parent_gap_lsns.keys().rev() { + for offset in lsn_offsets { + query_lsns.push(Lsn(image_lsn + .0 + .checked_add_signed(offset) + .expect("Shouldn't overflow"))); + } + } + + for query_lsn in query_lsns { + let results = child_timeline + .get_vectored_impl( + KeySpace { + ranges: vec![child_gap_at_key..child_gap_at_key.next()], + }, + query_lsn, + &ctx, + ) + .await; + + let expected_item = parent_gap_lsns + .iter() + .rev() + .find(|(lsn, _)| **lsn <= query_lsn); + + info!( + "Doing vectored read at LSN {}. Expecting image to be: {:?}", + query_lsn, expected_item + ); + + match expected_item { + Some((_, img_value)) => { + let key_results = results.expect("No vectored get error expected"); + let key_result = &key_results[&child_gap_at_key]; + let returned_img = key_result + .as_ref() + .expect("No page reconstruct error expected"); + + info!( + "Vectored read at LSN {} returned image {}", + query_lsn, + std::str::from_utf8(returned_img)? 
+ ); + assert_eq!(*returned_img, test_img(img_value)); + } + None => { + assert!(matches!(results, Err(GetVectoredError::MissingKey(_)))); + } + } + } + + Ok(()) + } + #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { let harness = TenantHarness::create("test_random_updates")?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 46b3d41e2b..3f2d807ce8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2968,7 +2968,8 @@ impl Timeline { break; } - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + // Take the min to avoid reconstructing a page with data newer than request Lsn. + cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline .get_ready_ancestor_timeline(ctx) .await From 98be8b94308135c19e49696141b41e86d90cb973 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 19 Apr 2024 12:32:58 +0100 Subject: [PATCH 045/157] storcon_cli: `tenant-warmup` command (#7432) ## Problem When we migrate a large existing tenant, we would like to be able to ensure it has pre-loaded layers onto a pageserver managed by the storage controller. ## Summary of changes - Add `storcon_cli tenant-warmup`, which configures the tenant into PlacementPolicy::Secondary (unless it's already attached), and then polls the secondary download API reporting progress. - Extend a test case to check that when onboarding with a secondary location pre-created, we properly use that location for our first attachment. --- control_plane/storcon_cli/src/main.rs | 102 +++++++++++++++++- .../regress/test_storage_controller.py | 25 ++++- 2 files changed, 120 insertions(+), 7 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 2edd09eac1..b3d1f0be05 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,15 +1,15 @@ -use std::{collections::HashMap, str::FromStr}; +use std::{collections::HashMap, str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; -use hyper::Method; +use hyper::{Method, StatusCode}; use pageserver_api::{ controller_api::{ NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantDescribeResponse, TenantPolicyRequest, }, models::{ - ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, - TenantShardSplitRequest, TenantShardSplitResponse, + LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest, + TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -120,6 +120,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary + /// mode so that it can warm up content on a pageserver. + TenantWarmup { + #[arg(long)] + tenant_id: TenantId, + }, } #[derive(Parser)] @@ -581,6 +587,94 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantWarmup { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await; + match describe_response { + Ok(describe) => { + if matches!(describe.policy, PlacementPolicy::Secondary) { + // Fine: it's already known to controller in secondary mode: calling + // again to put it into secondary mode won't cause problems. 
+ } else { + anyhow::bail!("Tenant already present with policy {:?}", describe.policy); + } + } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { + // Fine: this tenant isn't know to the storage controller yet. + } + Err(e) => { + // Unexpected API error + return Err(e.into()); + } + } + + vps_client + .location_config( + TenantShardId::unsharded(tenant_id), + pageserver_api::models::LocationConfig { + mode: pageserver_api::models::LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(LocationConfigSecondary { warm: true }), + shard_number: 0, + shard_count: 0, + shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, + tenant_conf: TenantConfig::default(), + }, + None, + true, + ) + .await?; + + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + let secondary_ps_id = describe_response + .shards + .first() + .unwrap() + .node_secondary + .first() + .unwrap(); + + println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); + loop { + let (status, progress) = vps_client + .tenant_secondary_download( + TenantShardId::unsharded(tenant_id), + Some(Duration::from_secs(10)), + ) + .await?; + println!( + "Progress: {}/{} layers, {}/{} bytes", + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + match status { + StatusCode::OK => { + println!("Download complete"); + break; + } + StatusCode::ACCEPTED => { + // Loop + } + _ => { + anyhow::bail!("Unexpected download status: {status}"); + } + } + } + } } Ok(()) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 840f354142..b4b23745f8 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -273,7 +273,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. 
""" - neon_env_builder.num_pageservers = 2 + # One pageserver to simulate legacy environment, two to be managed by storage controller + neon_env_builder.num_pageservers = 3 # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() @@ -288,10 +289,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) origin_ps = env.pageservers[0] - # This is the pageserver managed by the sharding service, where the tenant + # These are the pageservers managed by the sharding service, where the tenant # will be attached after onboarding env.pageservers[1].start() - dest_ps = env.pageservers[1] + env.pageservers[2].start() virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: @@ -330,6 +331,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) virtual_ps_http.tenant_secondary_download(tenant_id) + warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] # Call into storage controller to onboard the tenant generation += 1 @@ -344,6 +348,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0] + dest_ps_id = describe["node_attached"] + dest_ps = env.get_pageserver(dest_ps_id) + if warm_up: + # The storage controller should have attached the tenant to the same placce + # it had a secondary location, otherwise there was no point warming it up + assert dest_ps_id == warm_up_ps + + # It should have been given a new secondary location as well + assert len(describe["node_secondary"]) == 1 + assert describe["node_secondary"][0] != warm_up_ps + # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( tenant_id, @@ -415,6 +431,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] ) dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + + # Storage controller auto-sets heatmap period, ignore it for the comparison + del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"] assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf env.storage_controller.consistency_check() From e8a98adcd0a06a8c50c3483d7109e252f4d4d4e0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sun, 21 Apr 2024 11:56:34 +0100 Subject: [PATCH 046/157] CI: downgrade docker/setup-buildx-action to v2 - Cleanup part for `docker/setup-buildx-action` started to fail with the following error (for no obvious reason): ``` /nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175 throw new Error(`Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved.`); ^ Error: Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved. 
at Object.rejected (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175:1) at Generator.next () at fulfilled (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:29:1) ``` - Downgrade `docker/setup-buildx-action` from v3 to v2 --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1d35fa9223..c395b36c21 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -735,7 +735,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 - uses: docker/login-action@v3 with: @@ -792,7 +792,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 with: # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. From 0d21187322591412fbf7309d9e8780d660a9bf60 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 20 Apr 2024 12:37:58 +0300 Subject: [PATCH 047/157] update rustls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem `cargo deny check` is complaining about our rustls versions, causing CI to fail: ``` error[vulnerability]: `rustls::ConnectionCommon::complete_io` could fall into an infinite loop based on network input ┌─ /__w/neon/neon/Cargo.lock:395:1 │ 395 │ rustls 0.21.9 registry+https://github.com/rust-lang/crates.io-index │ ------------------------------------------------------------------- security vulnerability detected │ = ID: RUSTSEC-2024-0336 = Advisory: https://rustsec.org/advisories/RUSTSEC-2024-0336 = If a `close_notify` alert is received during a handshake, `complete_io` does not terminate. Callers which do not call `complete_io` are not affected. `rustls-tokio` and `rustls-ffi` do not call `complete_io` and are not affected. `rustls::Stream` and `rustls::StreamOwned` types use `complete_io` and are affected. = Announcement: https://github.com/rustls/rustls/security/advisories/GHSA-6g7w-8wpp-frhj = Solution: Upgrade to >=0.23.5 OR >=0.22.4, <0.23.0 OR >=0.21.11, <0.22.0 (try `cargo update -p rustls`) error[vulnerability]: `rustls::ConnectionCommon::complete_io` could fall into an infinite loop based on network input ┌─ /__w/neon/neon/Cargo.lock:396:1 │ 396 │ rustls 0.22.2 registry+https://github.com/rust-lang/crates.io-index │ ------------------------------------------------------------------- security vulnerability detected │ = ID: RUSTSEC-2024-0336 = Advisory: https://rustsec.org/advisories/RUSTSEC-2024-0336 = If a `close_notify` alert is received during a handshake, `complete_io` does not terminate. Callers which do not call `complete_io` are not affected. `rustls-tokio` and `rustls-ffi` do not call `complete_io` and are not affected. `rustls::Stream` and `rustls::StreamOwned` types use `complete_io` and are affected. 
= Announcement: https://github.com/rustls/rustls/security/advisories/GHSA-6g7w-8wpp-frhj = Solution: Upgrade to >=0.23.5 OR >=0.22.4, <0.23.0 OR >=0.21.11, <0.22.0 (try `cargo update -p rustls`) ``` ## Summary of changes `cargo update -p rustls@0.21.9 -p rustls@0.22.2` --- Cargo.lock | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6faf4b72f0..76183bdaab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -599,7 +599,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.9", + "rustls 0.21.11", "tokio", "tracing", ] @@ -2519,7 +2519,7 @@ dependencies = [ "http 0.2.9", "hyper 0.14.26", "log", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", @@ -4059,7 +4059,7 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4350,7 +4350,7 @@ dependencies = [ "routerify", "rstest", "rustc-hash", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4542,7 +4542,7 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-native-certs 0.7.0", "rustls-pemfile 2.1.1", "rustls-pki-types", @@ -4696,7 +4696,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-pemfile 1.0.2", "serde", "serde_json", @@ -4956,9 +4956,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.9" +version = "0.21.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" +checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", "ring 0.17.6", @@ -4968,9 +4968,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring 0.17.6", @@ -5282,7 +5282,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" dependencies = [ "httpdate", "reqwest", - "rustls 0.21.9", + "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -6193,7 +6193,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", "ring 0.17.6", - "rustls 0.22.2", + "rustls 0.22.4", "tokio", "tokio-postgres", "tokio-rustls 0.25.0", @@ -6206,7 +6206,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.9", + "rustls 0.21.11", "tokio", ] @@ -6216,7 +6216,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" dependencies = [ - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pki-types", "tokio", ] @@ -6677,7 +6677,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -7354,7 +7354,7 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest", - "rustls 0.21.9", + "rustls 0.21.11", "scopeguard", "serde", 
"serde_json", From 35e9fb360b4a0c51a88f98ffaf1c252f2f0850a5 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Sun, 21 Apr 2024 17:35:01 -0700 Subject: [PATCH 048/157] Bump vm-builder v0.23.2 -> v0.28.1 (#7433) Only one relevant change, from v0.28.0: - neondatabase/autoscaling#887 Double-checked with `git log neonvm/tools/vm-builder`. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c395b36c21..a7e108fac4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -865,7 +865,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.23.2 + VM_BUILDER_VERSION: v0.28.1 steps: - name: Checkout From 3a673dce67f0d5d9ab2163e9f4bd818bbc4b5375 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 22 Apr 2024 10:58:10 +0300 Subject: [PATCH 049/157] Make test less sensitive to exact WAL positions (#7436) As noted in the comment, the craft_internal() function fails if the inserted WAL happens to land at page boundary. I bumped into that with PR #7377; it changed the arguments of a few SQL functions in neon_test_utils extension, which changed the WAL positions slightly, and caused a test failure. --- libs/postgres_ffi/src/lib.rs | 4 +++- libs/postgres_ffi/wal_craft/src/lib.rs | 22 +++++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index aa6845b9b1..0d6986778a 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::bindings::{PageHeaderData, XLogRecord}; -pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::xlog_utils::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; pub use v14::bindings::{CheckPoint, ControlFileData}; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 23786e3b08..223ff08e8d 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,7 +4,9 @@ use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +use postgres_ffi::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -262,11 +264,21 @@ fn craft_internal( intermediate_lsns.insert(0, initial_lsn); } - // Some records may be not flushed, e.g. non-transactional logical messages. + // Some records may be not flushed, e.g. non-transactional logical messages. Flush now. // - // Note: this is broken if pg_current_wal_insert_lsn is at page boundary - // because pg_current_wal_insert_lsn skips page headers. - client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; + // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn + // returns the position just after the page header on the next page. That's where the next + // record will be inserted. 
But the page header hasn't actually been written to the WAL + // yet, and if you try to flush it, you get a "request to flush past end of generated WAL" + // error. Because of that, if the insert location is just after a page header, back off to + // previous page boundary. + let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?); + if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; + } + client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?; Ok(intermediate_lsns) } From 00d9c2d9a81491e1d159c85f6cd129b13755f9f8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 22 Apr 2024 10:58:28 +0300 Subject: [PATCH 050/157] Make another walcraft test more robust (#7439) There were two issues with the test at page boundaries: 1. If the first logical message with 10 bytes payload crossed a page boundary, the calculated 'base_size' was too large because it included the page header. 2. If it was inserted near the end of a page so that there was not enough room for another one, we did "remaining_lsn += XLOG_BLCKSZ" but that didn't take into account the page headers either. As a result, the test would fail if the WAL insert position at the beginning of the test was too close to the end of a WAL page. Fix the calculations by repeating the 10-byte logical message if the starting position is not suitable. I bumped into this with PR #7377; it changed the arguments of a few SQL functions in neon_test_utils extension, which changed the WAL positions slightly, and caused a test failure. This is similar to https://github.com/neondatabase/neon/pull/7436, but for different test. --- libs/postgres_ffi/wal_craft/src/lib.rs | 63 +++++++++++++++----------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 223ff08e8d..262068cbda 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -332,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { client.execute("CREATE table t(x int)", &[])?; - // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. - // We will use logical message as the padding. We start with detecting how much WAL - // it takes for one logical message, considering all alignments and headers. - let base_wal_advance = { + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We + // will use carefully-sized logical messages to advance WAL insert location such + // that there is just enough space on the page for the XLOG_SWITCH record. + loop { + // We start with measuring how much WAL it takes for one logical message, + // considering all alignments and headers. let before_lsn = client.pg_current_wal_insert_lsn()?; - // Small non-empty message bigger than few bytes is more likely than an empty - // message to have the same format as the big padding message. client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", &[], )?; - // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. - (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize - + XLOG_SIZE_OF_XLOG_RECORD - }; - let mut remaining_lsn = - XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) 
as usize % XLOG_BLCKSZ; - if remaining_lsn < base_wal_advance { - remaining_lsn += XLOG_BLCKSZ; + let after_lsn = client.pg_current_wal_insert_lsn()?; + + // Did the record cross a page boundary? If it did, start over. Crossing a + // page boundary adds to the apparent size of the record because of the page + // header, which throws off the calculation. + if u64::from(before_lsn) / XLOG_BLCKSZ as u64 + != u64::from(after_lsn) / XLOG_BLCKSZ as u64 + { + continue; + } + // base_size is the size of a logical message without the payload + let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10; + + // Is there enough space on the page for another logical message and an + // XLOG_SWITCH? If not, start over. + let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; + if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 { + continue; + } + + // We will write another logical message, such that after the logical message + // record, there will be space for exactly one XLOG_SWITCH. How large should + // the logical message's payload be? An XLOG_SWITCH record has no data => its + // size is exactly XLOG_SIZE_OF_XLOG_RECORD. + let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64; + + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + break; } - let repeats = 10 + remaining_lsn - base_wal_advance; - info!( - "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", - client.pg_current_wal_insert_lsn()?, - remaining_lsn, - base_wal_advance, - repeats - ); - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", - &[&(repeats as i32)], - )?; info!( "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", client.pg_current_wal_insert_lsn()?, From b91c58a8bf8b3e11451220fe3bb2a4479023fa45 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 22 Apr 2024 11:57:14 +0300 Subject: [PATCH 051/157] refactor(Timeline): simpler metadata updates (#7422) Currently, any `Timeline::schedule_uploads` will generate a fresh `TimelineMetadata` instead of updating the values, which it means to update. This makes it impossible for #6994 to work while `Timeline` receives layer flushes by overwriting any configured new `ancestor_timeline_id` and possible `ancestor_lsn`. The solution is to only make full `TimelineMetadata` "updates" from one place: branching. At runtime, update only the three fields, same as before in `Timeline::schedule_updates`. --- pageserver/src/tenant.rs | 5 +-- pageserver/src/tenant/metadata.rs | 27 ++++++++++++++++ .../src/tenant/remote_timeline_client.rs | 31 ++++++++++++++++--- pageserver/src/tenant/timeline.rs | 17 +++------- 4 files changed, 61 insertions(+), 19 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ff17400d45..15be6df637 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -559,9 +559,10 @@ impl Tenant { // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. + // FIXME: this branch should be dead code as we no longer write local metadata. 
let rtc = timeline.remote_client.as_ref().unwrap(); rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_metadata_update(&metadata)?; + rtc.schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -3027,7 +3028,7 @@ impl Tenant { // See also https://github.com/neondatabase/neon/issues/3865 if let Some(remote_client) = new_timeline.remote_client.as_ref() { remote_client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .context("branch initial metadata upload")?; } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 1736950d1f..39da713479 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -235,6 +235,12 @@ impl TimelineMetadata { let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() } + + pub(crate) fn apply(&mut self, update: &MetadataUpdate) { + self.body.disk_consistent_lsn = update.disk_consistent_lsn; + self.body.prev_record_lsn = update.prev_record_lsn; + self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn; + } } impl<'de> Deserialize<'de> for TimelineMetadata { @@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata { } } +/// Parts of the metadata which are regularly modified. +pub(crate) struct MetadataUpdate { + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, +} + +impl MetadataUpdate { + pub(crate) fn new( + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, + ) -> Self { + Self { + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cutoff_lsn, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 3879135f26..1fa3badefb 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -236,6 +236,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; +use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; @@ -536,9 +537,10 @@ impl RemoteTimelineClient { // Upload operations. // - /// /// Launch an index-file upload operation in the background, with - /// updated metadata. + /// fully updated metadata. + /// + /// This should only be used to upload initial metadata to remote storage. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previously scheduled layer file @@ -550,7 +552,7 @@ impl RemoteTimelineClient { /// If there were any changes to the list of files, i.e. if any /// layer file uploads were scheduled, since the last index file /// upload, those will be included too. - pub fn schedule_index_upload_for_metadata_update( + pub fn schedule_index_upload_for_full_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -566,6 +568,27 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, with only parts of the metadata + /// updated. + /// + /// This is the regular way of updating metadata on layer flushes or Gc. + /// + /// Using this lighter update mechanism allows for reparenting and detaching without changes to + /// `index_part.json`, while being more clear on what values update regularly. 
+ pub(crate) fn schedule_index_upload_for_metadata_update( + self: &Arc, + update: &MetadataUpdate, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.latest_metadata.apply(update); + + self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -2024,7 +2047,7 @@ mod tests { // Schedule upload of index. Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3f2d807ce8..8e815ddae8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3525,7 +3525,7 @@ impl Timeline { &self, disk_consistent_lsn: Lsn, layers_to_upload: impl IntoIterator, - ) -> anyhow::Result { + ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. We only track // 'prev_record_lsn' in memory for the latest processed record, so we @@ -3542,19 +3542,10 @@ impl Timeline { None }; - let ancestor_timeline_id = self - .ancestor_timeline - .as_ref() - .map(|ancestor| ancestor.timeline_id); - - let metadata = TimelineMetadata::new( + let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timeline_id, - self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), - self.initdb_lsn, - self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -3566,10 +3557,10 @@ impl Timeline { for layer in layers_to_upload { remote_client.schedule_layer_file_upload(layer)?; } - remote_client.schedule_index_upload_for_metadata_update(&metadata)?; + remote_client.schedule_index_upload_for_metadata_update(&update)?; } - Ok(metadata) + Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { From 47addc15f182cd1823cc4b7713117376823d281e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 22 Apr 2024 13:04:37 +0300 Subject: [PATCH 052/157] relaxation: allow using layers across timelines (#7453) Before, we asserted that a layer would only be loaded by the timeline that initially created it. Now, with the ancestor detach, we will want to utilize remote copy as much as possible, so we will need to open other timeline layers as our own. Cc: #6994 --- pageserver/src/tenant/storage_layer/delta_layer.rs | 3 +++ pageserver/src/tenant/storage_layer/image_layer.rs | 2 ++ 2 files changed, 5 insertions(+) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c5b5e5c98f..a4b2b4f840 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -728,6 +728,9 @@ impl DeltaLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; + if actual_summary != expected_summary { bail!( "in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 5b44d2bc2c..6f46a0203b 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -396,6 +396,8 @@ impl ImageLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; if actual_summary != expected_summary { bail!( From 6a5650d40c82496ea5d3fc7b870cf0e6e130e91f Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:37:22 +0200 Subject: [PATCH 053/157] proxy: Make retries configurable and record it. (#7438) ## Problem Currently we cannot configure retries, also, we don't really have visibility of what's going on there. ## Summary of changes * Added cli params * Improved logging * Decrease the number of retries: it feels like most of retries doesn't help. Once there would be better errors handling, we can increase it back. --- proxy/src/bin/proxy.rs | 10 ++++ proxy/src/config.rs | 55 ++++++++++++++++++ proxy/src/metrics.rs | 17 ++++++ proxy/src/proxy.rs | 2 + proxy/src/proxy/connect_compute.rs | 40 ++++++++++++-- proxy/src/proxy/retry.rs | 18 +++--- proxy/src/proxy/tests.rs | 89 +++++++++++++++++++++++------- proxy/src/proxy/wake_compute.rs | 36 ++++++++++-- proxy/src/serverless/backend.rs | 2 + 9 files changed, 226 insertions(+), 43 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index b54f8c131c..7df320fd42 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -200,6 +200,12 @@ struct ProxyCliArgs { /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. #[clap(long, default_value = "4194304")] metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -584,6 +590,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { handshake_timeout: args.handshake_timeout, region: args.region.clone(), aws_region: args.aws_region.clone(), + wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_to_compute_retry_config: config::RetryConfig::parse( + &args.connect_to_compute_retry, + )?, })); Ok(config) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f9519c7645..ae7606e5d4 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -33,6 +33,8 @@ pub struct ProxyConfig { pub region: String, pub handshake_timeout: Duration, pub aws_region: String, + pub wake_compute_retry_config: RetryConfig, + pub connect_to_compute_retry_config: RetryConfig, } #[derive(Debug)] @@ -517,6 +519,59 @@ impl FromStr for ProjectInfoCacheOptions { } } +/// This is a config for connect to compute and wake compute. +#[derive(Clone, Copy, Debug)] +pub struct RetryConfig { + /// Number of times we should retry. 
+ pub max_retries: u32, + /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 + pub base_delay: tokio::time::Duration, + /// Exponential base for retry wait duration + pub backoff_factor: f64, +} + +impl RetryConfig { + /// Default options for RetryConfig. + + /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. + pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; + /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. + /// Cplane has timeout of 60s on each request. + pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; + + /// Parse retry options passed via cmdline. + /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. + pub fn parse(options: &str) -> anyhow::Result { + let mut num_retries = None; + let mut base_retry_wait_duration = None; + let mut retry_wait_exponent_base = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "num_retries" => num_retries = Some(value.parse()?), + "base_retry_wait_duration" => { + base_retry_wait_duration = Some(humantime::parse_duration(value)?) + } + "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), + unknown => bail!("unknown key: {unknown}"), + } + } + + Ok(Self { + max_retries: num_retries.context("missing `num_retries`")?, + base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?, + backoff_factor: retry_wait_exponent_base + .context("missing `retry_wait_exponent_base`")?, + }) + } +} + /// Helper for cmdline cache options parsing. pub struct WakeComputeLockOptions { /// The number of shards the lock map should have diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 3a4e54aea0..530350008c 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -119,6 +119,10 @@ pub struct ProxyMetrics { /// Number of invalid endpoints (per protocol, per rejected). pub invalid_endpoints_total: CounterVec, + + /// Number of retries (per outcome, per retry_type). 
+ #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] + pub retries_metric: HistogramVec, } #[derive(MetricGroup)] @@ -480,3 +484,16 @@ pub struct InvalidEndpointsGroup { pub rejected: Bool, pub outcome: ConnectOutcome, } + +#[derive(LabelGroup)] +#[label(set = RetriesMetricSet)] +pub struct RetriesMetricGroup { + pub outcome: ConnectOutcome, + pub retry_type: RetryType, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum RetryType { + WakeCompute, + ConnectToCompute, +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 4321bad968..a4554eef38 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -308,6 +308,8 @@ pub async fn handle_client( &TcpMechanism { params: ¶ms }, &user_info, mode.allow_self_signed_compute(config), + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, ) .or_else(|e| stream.throw_error(e)) .await?; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 33f394c550..8a220aaa0c 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,10 +1,11 @@ use crate::{ auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, + config::RetryConfig, console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::{ConnectionFailureKind, Metrics}, + metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, @@ -93,19 +94,23 @@ pub async fn connect_to_compute( mechanism: &M, user_info: &B, allow_self_signed_compute: bool, + wake_compute_retry_config: RetryConfig, + connect_to_compute_retry_config: RetryConfig, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { let mut num_retries = 0; - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; if let Some(keys) = user_info.get_keys() { node_info.set_keys(keys); } node_info.allow_self_signed_compute = allow_self_signed_compute; // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); + let retry_type = RetryType::ConnectToCompute; // try once let err = match mechanism @@ -114,6 +119,13 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); return Ok(res); } Err(e) => e, @@ -124,7 +136,7 @@ where let node_info = if !node_info.cached() { // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. 
- if !err.should_retry(num_retries) { + if !err.should_retry(num_retries, connect_to_compute_retry_config) { return Err(err.into()); } node_info @@ -132,7 +144,8 @@ where // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); let old_node_info = invalidate_cache(node_info); - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.reuse_settings(old_node_info); mechanism.update_connect_config(&mut node_info.config); @@ -151,19 +164,34 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); + info!(?num_retries, "connected to compute node after"); return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries); + let retriable = e.should_retry(num_retries, connect_to_compute_retry_config); if !retriable { error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(e.into()); } warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); } } - let wait_duration = retry_after(num_retries); + let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; time::sleep(wait_duration).await; diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index a85ed380b0..082e06caa3 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,18 +1,12 @@ -use crate::compute; +use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -/// Number of times we should retry the `/proxy_wake_compute` http request. 
-/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; - pub trait ShouldRetry { fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { + fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool { match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, + _ if num_retries >= config.max_retries => false, err => err.could_retry(), } } @@ -63,6 +57,8 @@ impl ShouldRetry for compute::ConnectionError { } } -pub fn retry_after(num_retries: u32) -> time::Duration { - BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { + config + .base_delay + .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 849e9bd33c..e0ec90cb44 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -10,13 +10,13 @@ use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; -use crate::config::CertResolver; +use crate::config::{CertResolver, RetryConfig}; use crate::console::caches::NodeInfoCache; use crate::console::messages::MetricsAuxInfo; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; +use crate::proxy::retry::retry_after; use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; @@ -361,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 1..NUM_RETRIES_CONNECT { - total_wait += retry_after(num_retries); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + for num_retries in 1..config.max_retries { + total_wait += retry_after(num_retries, config); } - assert!(total_wait < tokio::time::Duration::from_secs(12)); - assert!(total_wait > tokio::time::Duration::from_secs(10)); + assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1); } #[derive(Clone, Copy, Debug)] @@ -549,7 +553,12 @@ async fn connect_to_compute_success() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -562,7 +571,12 @@ async fn connect_to_compute_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, 
config, config) .await .unwrap(); mechanism.verify(); @@ -576,7 +590,12 @@ async fn connect_to_compute_non_retry_1() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -590,7 +609,12 @@ async fn connect_to_compute_non_retry_2() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -600,17 +624,32 @@ async fn connect_to_compute_non_retry_2() { #[tokio::test] async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); - assert_eq!(NUM_RETRIES_CONNECT, 16); + tokio::time::pause(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![ - Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, - ]); + let mechanism = + TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap_err(); + let wake_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 1, + backoff_factor: 2.0, + }; + let connect_to_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute( + &mut ctx, + &mechanism, + &user_info, + false, + wake_compute_retry_config, + connect_to_compute_retry_config, + ) + .await + .unwrap_err(); mechanism.verify(); } @@ -622,7 +661,12 @@ async fn wake_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -636,7 +680,12 @@ async fn wake_non_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index fe228ab33d..cfedf0e98a 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,10 +1,14 @@ +use 
crate::config::RetryConfig; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind}; +use crate::metrics::{ + ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, + WakeupFailureKind, +}; use crate::proxy::retry::retry_after; use hyper::StatusCode; use std::ops::ControlFlow; -use tracing::{error, warn}; +use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; use super::retry::ShouldRetry; @@ -13,23 +17,42 @@ pub async fn wake_compute( num_retries: &mut u32, ctx: &mut RequestMonitoring, api: &B, + config: RetryConfig, ) -> Result { + let retry_type = RetryType::WakeCompute; loop { let wake_res = api.wake_compute(ctx).await; - match handle_try_wake(wake_res, *num_retries) { + match handle_try_wake(wake_res, *num_retries, config) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); report_error(&e, false); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + (*num_retries).into(), + ); return Err(e); } Ok(ControlFlow::Continue(e)) => { warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); report_error(&e, true); } - Ok(ControlFlow::Break(n)) => return Ok(n), + Ok(ControlFlow::Break(n)) => { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + (*num_retries).into(), + ); + info!(?num_retries, "compute node woken up after"); + return Ok(n); + } } - let wait_duration = retry_after(*num_retries); + let wait_duration = retry_after(*num_retries, config); *num_retries += 1; tokio::time::sleep(wait_duration).await; } @@ -42,10 +65,11 @@ pub async fn wake_compute( pub fn handle_try_wake( result: Result, num_retries: u32, + config: RetryConfig, ) -> Result, WakeComputeError> { match result { Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => { Ok(ControlFlow::Continue(err)) } _ => Err(err), diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index e74c63599a..b91c0e62ed 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -108,6 +108,8 @@ impl PoolingBackend { }, &backend, false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, ) .await } From 0bd16182f7b2e7abedbb218238d83928f67607bc Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Apr 2024 12:47:24 +0100 Subject: [PATCH 054/157] pageserver: fix unlogged relations with sharding (#7454) ## Problem - #7451 INIT_FORKNUM blocks must be stored on shard 0 to enable including them in basebackup. This issue can be missed in simple tests because creating an unlogged table isn't sufficient -- to repro I had to create an _index_ on an unlogged table (then restart the endpoint). Closes: #7451 ## Summary of changes - Add a reproducer for the issue. - Tweak the condition for `key_is_shard0` to include anything that isn't a normal relation block _and_ any normal relation block whose forknum is INIT_FORKNUM. - To enable existing databases to recover from the issue, add a special case that omits relations if they were stored on the wrong INITFORK. 
This enables postgres to start and the user to drop the table and recreate it. --- libs/pageserver_api/src/shard.rs | 27 +++++++++++++++++- pageserver/src/basebackup.rs | 17 +++++++++-- test_runner/regress/test_sharding.py | 42 ++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index c293ad705b..6a8a5cc8f3 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -5,6 +5,7 @@ use crate::{ models::ShardParameters, }; use hex::FromHex; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; use utils::id::TenantId; @@ -537,6 +538,24 @@ impl ShardIdentity { } } + /// Special case for issue `` + /// + /// When we fail to read a forknum block, this function tells us whether we may ignore the error + /// as a symptom of that issue. + pub fn is_key_buggy_forknum(&self, key: &Key) -> bool { + if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM { + return false; + } + + let mut hash = murmurhash32(key.field4); + hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0)); + let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8); + + // The key may be affected by issue #7454: it is an initfork and it would not + // have mapped to shard 0 until we fixed that issue. + mapped_shard != ShardNumber(0) + } + /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// @@ -649,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. + let is_initfork = key.field5 == INIT_FORKNUM; + + !is_rel_block_key(key) || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0479d05f8f..107758f385 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, Key}; +use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -297,7 +297,20 @@ where if rel.forknum == INIT_FORKNUM { // I doubt we need _init fork itself, but having it at least // serves as a marker relation is unlogged. - self.add_rel(rel, rel).await?; + if let Err(_e) = self.add_rel(rel, rel).await { + if self + .timeline + .get_shard_identity() + .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0)) + { + // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation + // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows + // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and + // recreate. 
+ tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation"); + continue; + } + }; self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; continue; } diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bfaab9125f..101d2620b0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1201,3 +1201,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) diff = max_lsn - min_lsn assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = [f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) From 139d1346d5aed41e1cf1479343943f9bf3670794 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 22 Apr 2024 14:55:17 +0200 Subject: [PATCH 055/157] pagectl draw-timeline-dir: include layer file name as an SVG comment (#7455) fixes https://github.com/neondatabase/neon/issues/7452 Also, drive-by improve the usage instructions with commands I found useful during that incident. The patch in the fork of `svg_fmt` is [being upstreamed](https://github.com/nical/rust_debug/pull/4), but, in the meantime, let's commit what we have because it was useful during the incident. 
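For reference, here is a hypothetical sketch (not the `pagectl` code, and assuming the forked `svg_fmt` renders `.comment(...)` as a plain XML comment placed next to the shape) of what a layer rectangle in the generated SVG is expected to look like:

```rust
// Hypothetical illustration only: the layer file name rides along with its
// rectangle as an XML comment, so it survives copy/paste and can be located
// with a plain text search instead of an SVG editor.
fn layer_rect(x: f32, y: f32, w: f32, h: f32, layer_file_name: &str) -> String {
    format!(
        r#"<rect x="{x}" y="{y}" width="{w}" height="{h}" fill="rgb(200,100,100)" stroke="black" stroke-width="0.1" rx="0.4"/> <!-- {layer_file_name} -->"#
    )
}

fn main() {
    // The file name below is a made-up example of a delta layer name.
    println!(
        "{}",
        layer_rect(
            3.0,
            10.0,
            5.0,
            2.0,
            "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E8",
        )
    );
}
```

With the name embedded like this, a plain `grep <layer file name> out.svg` is usually enough to find the matching rectangle without opening an editor.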
--- Cargo.lock | 3 +- Cargo.toml | 3 +- pageserver/ctl/src/draw_timeline_dir.rs | 73 ++++++++++++++++++++----- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 76183bdaab..cff07239e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5830,8 +5830,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" +source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10" [[package]] name = "syn" diff --git a/Cargo.toml b/Cargo.toml index 8310d2d522..677eaa9ce4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -157,7 +157,8 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -svg_fmt = "0.4.1" +# https://github.com/nical/rust_debug/pull/4 +svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 0e77ef0563..9a556cb3d4 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -9,18 +9,45 @@ //! Coordinates in both axis are compressed for better readability. //! (see ) //! -//! Example use: +//! The plain text API was chosen so that we can easily work with filenames from various +//! sources; see the Usage section below for examples. +//! +//! # Usage +//! +//! ## Producing the SVG +//! //! ```bash -//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg -//! $ firefox out.svg +//! +//! # local timeline dir +//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg +//! +//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` +//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg +//! +//! # From an `index_part.json` in S3 +//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg +//! //! ``` //! -//! This API was chosen so that we can easily work with filenames extracted from ssh, -//! or from pageserver log files. +//! ## Viewing //! -//! TODO Consider shipping this as a grafana panel plugin: -//! +//! **Inkscape** is better than the built-in viewers in browsers. +//! +//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X) +//! to see the layer file name in the comment field. +//! +//! ```bash +//! +//! # Linux +//! inkscape out.svg +//! +//! # macOS +//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg +//! +//! ``` +//! 
+ use anyhow::Result; use pageserver::repository::Key; use pageserver::METADATA_FILE_NAME; @@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range, Range) { pub fn main() -> Result<()> { // Parse layer filenames from stdin - let mut ranges: Vec<(Range, Range)> = vec![]; + struct Layer { + filename: String, + key_range: Range, + lsn_range: Range, + } + let mut files: Vec = vec![]; let stdin = io::stdin(); for line in stdin.lock().lines() { let line = line.unwrap(); @@ -76,14 +108,23 @@ pub fn main() -> Result<()> { // Don't try and parse "metadata" like a key-lsn range continue; } - let range = parse_filename(filename); - ranges.push(range); + let (key_range, lsn_range) = parse_filename(filename); + files.push(Layer { + filename: filename.to_owned(), + key_range, + lsn_range, + }); } // Collect all coordinates let mut keys: Vec = vec![]; let mut lsns: Vec = vec![]; - for (keyr, lsnr) in &ranges { + for Layer { + key_range: keyr, + lsn_range: lsnr, + .. + } in &files + { keys.push(keyr.start); keys.push(keyr.end); lsns.push(lsnr.start); @@ -107,7 +148,12 @@ pub fn main() -> Result<()> { h: stretch * lsn_map.len() as f32 } ); - for (keyr, lsnr) in &ranges { + for Layer { + filename, + key_range: keyr, + lsn_range: lsnr, + } in &files + { let key_start = *key_map.get(&keyr.start).unwrap(); let key_end = *key_map.get(&keyr.end).unwrap(); let key_diff = key_end - key_start; @@ -151,6 +197,7 @@ pub fn main() -> Result<()> { .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .border_radius(0.4) + .comment(filename) ); } println!("{}", EndSvg); From 25d9dc6eaf9803675bd694a6d5f107947c8c24aa Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 22 Apr 2024 15:40:35 +0100 Subject: [PATCH 056/157] chore(pageserver): separate missing key error (#7393) As part of https://github.com/neondatabase/neon/pull/7375 and to improve the current vectored get implementation, we separate the missing key error out. This also saves us several Box allocations in the get page implementation. ## Summary of changes * Create a caching field of layer traversal id for each of the layer. * Remove box allocations for layer traversal id retrieval and implement MissingKey error message as before. This should be a little bit faster. * Do not format error message until `Display`. * For in-mem layer, the descriptor is different before/after frozen. I'm using once lock for that. 
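A minimal, self-contained sketch of that once-lock pattern (simplified names, not the actual `InMemoryLayer` fields): the open-layer representation is formatted up front, the frozen one is set exactly once when the end LSN becomes known, and the read path only clones an `Arc`.

```rust
use std::sync::{Arc, OnceLock};

// Simplified stand-in for the in-memory layer's cached traversal id.
struct InMemDescriptor {
    // Representation while the layer is still open (end LSN unknown).
    open_repr: Arc<str>,
    // Set exactly once, when the layer is frozen.
    frozen_repr: OnceLock<Arc<str>>,
}

impl InMemDescriptor {
    fn new(timeline: &str, start_lsn: u64) -> Self {
        Self {
            open_repr: format!(
                "timeline {timeline} in-memory inmem-{start_lsn:016X}-{:016X}",
                u64::MAX
            )
            .into(),
            frozen_repr: OnceLock::new(),
        }
    }

    fn freeze(&self, timeline: &str, start_lsn: u64, end_lsn: u64) {
        self.frozen_repr
            .set(format!("timeline {timeline} in-memory inmem-{start_lsn:016X}-{end_lsn:016X}").into())
            .expect("frozen representation is set only once");
    }

    // No string formatting on the read path: just an Arc clone.
    fn traversal_id(&self) -> Arc<str> {
        Arc::clone(self.frozen_repr.get().unwrap_or(&self.open_repr))
    }
}

fn main() {
    let d = InMemDescriptor::new("11112222333344445555666677778888", 0x169_6070);
    println!("{}", d.traversal_id());
    d.freeze("11112222333344445555666677778888", 0x169_6070, 0x169_60E8);
    println!("{}", d.traversal_id());
}
```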
--------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 3 + pageserver/src/pgdatadir_mapping.rs | 12 +- .../tenant/storage_layer/inmemory_layer.rs | 45 ++++- pageserver/src/tenant/storage_layer/layer.rs | 8 + .../src/tenant/storage_layer/layer/tests.rs | 4 +- pageserver/src/tenant/timeline.rs | 165 ++++++++++-------- 6 files changed, 157 insertions(+), 80 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 20258dd950..81508965b4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -160,6 +160,9 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), + PageReconstructError::MissingKey(e) => { + ApiError::InternalServerError(anyhow::anyhow!("{e}")) + } PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6f7d74bdee..351a766b10 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1446,10 +1446,14 @@ impl<'a> DatadirModification<'a> { // reset the map. return Err(e.into()); } - // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so - // we are assuming that all _other_ possible errors represents a missing key. If some - // other error occurs, we may incorrectly reset the map of aux files. - Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => { + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey { .. }, + ) => { // Key is missing, we must insert an image as the basis for subsequent deltas. let mut dir = AuxFilesDirectory { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 29751641b4..a86d0d48c5 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -26,7 +26,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // while being able to use std::fmt::Write's methods use crate::metrics::TIMELINE_EPHEMERAL_BYTES; use std::cmp::Ordering; -use std::fmt::Write as _; +use std::fmt::Write; use std::ops::Range; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; @@ -54,6 +54,12 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + /// Used for traversal path. Cached representation of the in-memory layer before frozen. + local_path_str: Arc, + + /// Used for traversal path. Cached representation of the in-memory layer after frozen. + frozen_local_path_str: OnceLock>, + opened_at: Instant, /// The above fields never change, except for `end_lsn`, which is only set once. 
@@ -241,6 +247,12 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } + pub(crate) fn local_path_str(&self) -> &Arc { + self.frozen_local_path_str + .get() + .unwrap_or(&self.local_path_str) + } + /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -430,10 +442,24 @@ impl InMemoryLayer { } } +fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { + write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) +} + +fn inmem_layer_log_display( + mut f: impl Write, + timeline: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, +) -> std::fmt::Result { + write!(f, "timeline {} in-memory ", timeline)?; + inmem_layer_display(f, start_lsn, end_lsn) +} + impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let end_lsn = self.end_lsn_or_max(); - write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) + inmem_layer_display(f, self.start_lsn, end_lsn) } } @@ -458,6 +484,12 @@ impl InMemoryLayer { Ok(InMemoryLayer { file_id: key, + local_path_str: { + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); + buf.into() + }, + frozen_local_path_str: OnceLock::new(), conf, timeline_id, tenant_shard_id, @@ -552,6 +584,15 @@ impl InMemoryLayer { ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); + self.frozen_local_path_str + .set({ + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn) + .unwrap(); + buf.into() + }) + .expect("frozen_local_path_str set only once"); + for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { assert!(*lsn < end_lsn); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index e55299f0fa..316a11f8cc 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -395,6 +395,10 @@ impl Layer { &self.0.path } + pub(crate) fn local_path_str(&self) -> &Arc { + &self.0.path_str + } + pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -517,6 +521,9 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, + /// String representation of the full path, used for traversal id. + path_str: Arc, + desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. @@ -722,6 +729,7 @@ impl LayerInner { LayerInner { conf, + path_str: path.to_string().into(), path, desc, timeline: Arc::downgrade(timeline), diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index f0697fdf28..52f62faa8d 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -818,11 +818,13 @@ async fn eviction_cancellation_on_drop() { } } +/// A test case to remind you the cost of these structures. You can bump the size limit +/// below if it is really necessary to add more fields to the structures. 
#[test] fn layer_size() { assert_eq!(std::mem::size_of::(), 2040); assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2328); + assert_eq!(std::mem::size_of::(), 2344); // it also has the utf8 path } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8e815ddae8..e707c3b244 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,7 +23,7 @@ use pageserver_api::{ EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, }, reltag::BlockNumber, - shard::{ShardIdentity, TenantShardId}, + shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; use serde_with::serde_as; @@ -428,6 +428,62 @@ pub(crate) enum PageReconstructError { /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), + + #[error("{0}")] + MissingKey(MissingKeyError), +} + +#[derive(Debug)] +pub struct MissingKeyError { + stuck_at_lsn: bool, + key: Key, + shard: ShardNumber, + cont_lsn: Lsn, + request_lsn: Lsn, + ancestor_lsn: Option, + traversal_path: Vec, + backtrace: Option, +} + +impl std::fmt::Display for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.stuck_at_lsn { + // Records are found in this timeline but no image layer or initial delta record was found. + write!( + f, + "could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { + write!(f, ", ancestor {}", ancestor_lsn)?; + } + } else { + // No records in this timeline. + write!( + f, + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + } + + if !self.traversal_path.is_empty() { + writeln!(f)?; + } + + for (r, c, l) in &self.traversal_path { + writeln!( + f, + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, c, l, + )?; + } + + if let Some(ref backtrace) = self.backtrace { + write!(f, "\n{}", backtrace)?; + } + + Ok(()) + } } impl PageReconstructError { @@ -439,6 +495,7 @@ impl PageReconstructError { AncestorLsnTimeout(_) => false, Cancelled | AncestorStopping(_) => true, WalRedo(_) => false, + MissingKey { .. } => false, } } } @@ -753,7 +810,7 @@ impl Timeline { writeln!( msg, "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer(), + layer, ) .expect("string grows") }); @@ -872,9 +929,11 @@ impl Timeline { Err(Cancelled | AncestorStopping(_)) => { return Err(GetVectoredError::Cancelled) } - Err(Other(err)) if err.to_string().contains("could not find data for key") => { - return Err(GetVectoredError::MissingKey(key)) - } + // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380 + Err(MissingKey(MissingKeyError { + stuck_at_lsn: false, + .. 
+ })) => return Err(GetVectoredError::MissingKey(key)), _ => { values.insert(key, block); key = key.next(); @@ -2692,7 +2751,7 @@ impl Timeline { } } -type TraversalId = String; +type TraversalId = Arc; trait TraversalLayerExt { fn traversal_id(&self) -> TraversalId; @@ -2700,13 +2759,13 @@ trait TraversalLayerExt { impl TraversalLayerExt for Layer { fn traversal_id(&self) -> TraversalId { - self.local_path().to_string() + Arc::clone(self.local_path_str()) } } impl TraversalLayerExt for Arc { fn traversal_id(&self) -> TraversalId { - format!("timeline {} in-memory {self}", self.get_timeline_id()) + Arc::clone(self.local_path_str()) } } @@ -2775,32 +2834,35 @@ impl Timeline { if prev <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. - return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + return Err(PageReconstructError::MissingKey(MissingKeyError { + stuck_at_lsn: true, key, - Lsn(cont_lsn.0 - 1), + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(cont_lsn.0 - 1), request_lsn, - timeline.ancestor_lsn - ), traversal_path)); + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path, + backtrace: None, + })); } } prev_lsn = Some(cont_lsn); } ValueReconstructResult::Missing => { - return Err(layer_traversal_error( - if cfg!(test) { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), - ) - } else { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn - ) - }, + return Err(PageReconstructError::MissingKey(MissingKeyError { + stuck_at_lsn: false, + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, traversal_path, - )); + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + None + }, + })); } } @@ -2848,11 +2910,7 @@ impl Timeline { }; cont_lsn = lsn_floor; // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new(move || open_layer.traversal_id()), - )); + traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } } @@ -2879,11 +2937,7 @@ impl Timeline { }; cont_lsn = lsn_floor; // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new(move || frozen_layer.traversal_id()), - )); + traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } } @@ -2904,14 +2958,7 @@ impl Timeline { }; cont_lsn = lsn_floor; *read_count += 1; - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let layer = layer.to_owned(); - move || layer.traversal_id() - }), - )); + traversal_path.push((result, cont_lsn, layer.traversal_id())); continue 'outer; } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent @@ -4656,35 +4703,7 @@ impl Timeline { } } -type TraversalPathItem = ( - ValueReconstructResult, - Lsn, - Box TraversalId>, -); - -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. 
-fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructError { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .into_iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l(), - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. - let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - PageReconstructError::from(msg) -} +type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); struct TimelineWriterState { open_layer: Arc, From e69ff3fc00ab8be31e8f69eb3726da1b83d84180 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 22 Apr 2024 19:40:08 +0300 Subject: [PATCH 057/157] Refactor updating relation size cache on reads (#7376) Instead of trusting that a request with latest == true means that the request LSN was at least last_record_lsn, remember explicitly when the relation cache was initialized. Incidentally, this allows updating the relation size cache also on reads from read-only endpoints, when the endpoint is at a relatively recent LSN (more recent than the end of the timeline when the timeline was loaded in the pageserver). Add a comment to wait_or_get_last_lsn() that it might be better to use an older LSN when possible. Note that doing that would be unsafe, without the relation cache changes in this commit! --- pageserver/src/page_service.rs | 5 +++++ pageserver/src/pgdatadir_mapping.rs | 29 +++++++++++++++-------------- pageserver/src/tenant/timeline.rs | 17 +++++++++++++++-- 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3b9a30ba4c..62782d8dd3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -874,6 +874,11 @@ impl PageServerHandler { // walsender completes the authentication and starts streaming the // WAL. if lsn <= last_record_lsn { + // It might be better to use max(lsn, latest_gc_cutoff_lsn) instead + // last_record_lsn. That would give the same result, since we know + // that there haven't been modifications since 'lsn'. Using an older + // LSN might be faster, because that could allow skipping recent + // layers when finding the page. lsn = last_record_lsn; } else { timeline diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 351a766b10..4a9682dcac 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -252,16 +252,8 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - if latest { - // Update relation size cache only if "latest" flag is set. - // This flag is set by compute when it is working with most recent version of relation. - // Typically master compute node always set latest=true. - // Please notice, that even if compute node "by mistake" specifies old LSN but set - // latest=true, then it can not cause cache corruption, because with latest=true - // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be - // associated with most recent value of LSN. 
- self.update_cached_rel_size(tag, version.get_lsn(), nblocks); - } + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); + Ok(nblocks) } @@ -817,7 +809,7 @@ impl Timeline { /// Get cached size of relation if it not updated after specified LSN pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { if lsn >= *cached_lsn { return Some(*nblocks); } @@ -828,7 +820,16 @@ impl Timeline { /// Update cached relation size if there is no more recent update pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - match rel_size_cache.entry(tag) { + + if lsn < rel_size_cache.complete_as_of { + // Do not cache old values. It's safe to cache the size on read, as long as + // the read was at an LSN since we started the WAL ingestion. Reasoning: we + // never evict values from the cache, so if the relation size changed after + // 'lsn', the new value is already in the cache. + return; + } + + match rel_size_cache.map.entry(tag) { hash_map::Entry::Occupied(mut entry) => { let cached_lsn = entry.get_mut(); if lsn >= cached_lsn.0 { @@ -844,13 +845,13 @@ impl Timeline { /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.insert(tag, (lsn, nblocks)); + rel_size_cache.map.insert(tag, (lsn, nblocks)); } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.remove(tag); + rel_size_cache.map.remove(tag); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e707c3b244..fa7d219fb0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -182,6 +182,16 @@ pub(crate) struct AuxFilesState { pub(crate) n_deltas: usize, } +/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL +/// ingestion considerably, because WAL ingestion needs to check on most records if the record +/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end +/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the +/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. 
+pub(crate) struct RelSizeCache { + pub(crate) complete_as_of: Lsn, + pub(crate) map: HashMap, +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -324,7 +334,7 @@ pub struct Timeline { pub walreceiver: Mutex>, /// Relation size cache - pub rel_size_cache: RwLock>, + pub(crate) rel_size_cache: RwLock, download_all_remote_layers_task_info: RwLock>, @@ -1951,7 +1961,10 @@ impl Timeline { last_image_layer_creation_check_at: AtomicLsn::new(0), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), + rel_size_cache: RwLock::new(RelSizeCache { + complete_as_of: disk_consistent_lsn, + map: HashMap::new(), + }), download_all_remote_layers_task_info: RwLock::new(None), From d551bfee091abed46152f26c06e86a216ab8ac08 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Apr 2024 11:36:56 +0100 Subject: [PATCH 058/157] pageserver: remove import/export script previously used for breaking format changes (#7458) ## Problem The `export_import_between_pageservers` script us to do major storage format changes in the past. If we have to do such breaking changes in the future this approach wouldn't be suitable because: 1. It doesn't scale to the current size of the fleet 2. It loses history ## Summary of changes Remove the script and its associated test. Keep `fullbasebackup` and friends because it's useful for debugging. Closes https://github.com/neondatabase/cloud/issues/11648 --- pageserver/src/page_service.rs | 4 + scripts/export_import_between_pageservers.py | 730 ------------------ test_runner/regress/test_tenant_relocation.py | 49 +- 3 files changed, 8 insertions(+), 775 deletions(-) delete mode 100755 scripts/export_import_between_pageservers.py diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 62782d8dd3..fa6b81ac72 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1206,6 +1206,10 @@ impl PageServerHandler { )) } + /// Note on "fullbackup": + /// Full basebackups should only be used for debugging purposes. + /// Originally, it was introduced to enable breaking storage format changes, + /// but that is not applicable anymore. #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py deleted file mode 100755 index 84b69cb36a..0000000000 --- a/scripts/export_import_between_pageservers.py +++ /dev/null @@ -1,730 +0,0 @@ -# -# Script to export tenants from one pageserver and import them into another page server. -# -# Outline of steps: -# 1. Get `(last_lsn, prev_lsn)` from old pageserver -# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file -# 3. This tar file might be missing relation files for empty relations, if the pageserver -# is old enough (we didn't always store those). So to recreate them, we start a local -# vanilla postgres on this basebackup and ask it what relations should exist, then touch -# any missing files and re-pack the tar. -# TODO This functionality is no longer needed, so we can delete it later if we don't -# end up using the same utils for the pg 15 upgrade. Not sure. -# 4. We import the patched basebackup into a new pageserver -# 5. We export again via fullbackup, now from the new pageserver and compare the returned -# tar file with the one we imported. 
This confirms that we imported everything that was -# exported, but doesn't guarantee correctness (what if we didn't **export** everything -# initially?) -# 6. We wait for the new pageserver's remote_consistent_lsn to catch up -# -# For more context on how to use this, see: -# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf - -import argparse -import os -import shutil -import subprocess -import tempfile -import time -import uuid -from contextlib import closing -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -import psycopg2 -import requests -from psycopg2.extensions import connection as PgConnection -from psycopg2.extensions import parse_dsn - -############################################### -### client-side utils copied from test fixtures -############################################### - -Env = Dict[str, str] - -_global_counter = 0 - - -def global_counter() -> int: - """A really dumb global counter. - This is useful for giving output files a unique number, so if we run the - same command multiple times we can keep their output separate. - """ - global _global_counter - _global_counter += 1 - return _global_counter - - -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" - where "cmd" is the name of the program and NNN is an incrementing - counter. - If those files already exist, we will overwrite them. - Returns basepath for files with captured output. - """ - assert isinstance(cmd, list) - base = f"{os.path.basename(cmd[0])}_{global_counter()}" - basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + ".stdout" - stderr_filename = basepath + ".stderr" - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f'(capturing output to "{base}.stdout")') - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) - - return basepath - - -class PgBin: - """A helper class for executing postgres binaries""" - - def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): - self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin") - self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib") - - def _fixpath(self, command: List[str]): - if "/" not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) - - def _build_env(self, env_add: Optional[Env]) -> Env: - if env_add is None: - return self.env - env = self.env.copy() - env.update(env_add) - return env - - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): - """ - Run one of the postgres binaries. - The command should be in list form, e.g. ['pgbench', '-p', '55432'] - All the necessary environment variables will be set. - If the first argument (the command name) doesn't include a path (no '/' - characters present), then it will be edited to include the correct path. - If you want stdout/stderr captured to files, use `run_capture` instead. 
- """ - - self._fixpath(command) - print(f'Running command "{" ".join(command)}"') - env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) - - def run_capture( - self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any, - ) -> str: - """ - Run one of the postgres binaries, with stderr and stdout redirected to a file. - This is just like `run`, but for chatty programs. Returns basepath for files - with captured output. - """ - - self._fixpath(command) - print(f'Running command "{" ".join(command)}"') - env = self._build_env(env) - return subprocess_capture( - str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs - ) - - -class PgProtocol: - """Reusable connection logic""" - - def __init__(self, **kwargs): - self.default_options = kwargs - - def conn_options(self, **kwargs): - conn_options = self.default_options.copy() - if "dsn" in kwargs: - conn_options.update(parse_dsn(kwargs["dsn"])) - conn_options.update(kwargs) - - # Individual statement timeout in seconds. 2 minutes should be - # enough for our tests, but if you need a longer, you can - # change it by calling "SET statement_timeout" after - # connecting. - conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" - - return conn_options - - # autocommit=True here by default because that's what we need most of the time - def connect(self, autocommit=True, **kwargs) -> PgConnection: - """ - Connect to the node. - Returns psycopg2's connection object. - This method passes all extra params to connstr. - """ - conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) - - # WARNING: this setting affects *all* tests! - conn.autocommit = autocommit - return conn - - def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: - """ - Execute query against the node and return all rows. - This method passes all extra params to connstr. - """ - return self.safe_psql_many([query], **kwargs)[0] - - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: - """ - Execute queries against the node and return all rows. - This method passes all extra params to connstr. 
- """ - result: List[List[Any]] = [] - with closing(self.connect(**kwargs)) as conn: - with conn.cursor() as cur: - for query in queries: - print(f"Executing query: {query}") - cur.execute(query) - - if cur.description is None: - result.append([]) # query didn't return data - else: - result.append(cast(List[Any], cur.fetchall())) - return result - - -class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host="localhost", port=port, dbname="postgres") - self.pgdatadir = pgdatadir - self.pg_bin = pg_bin - self.running = False - if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) - self.configure([f"port = {port}\n"]) - - def configure(self, options: List[str]): - """Append lines into postgresql.conf file.""" - assert not self.running - with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: - conf_file.write("\n".join(options)) - - def start(self, log_path: Optional[str] = None): - assert not self.running - self.running = True - - log_path = log_path or os.path.join(self.pgdatadir, "pg.log") - - self.pg_bin.run_capture( - ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] - ) - - def stop(self): - assert self.running - self.running = False - self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self.running: - self.stop() - - -class NeonPageserverApiException(Exception): - pass - - -class NeonPageserverHttpClient(requests.Session): - def __init__(self, host, port): - super().__init__() - self.host = host - self.port = port - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise NeonPageserverApiException(msg) from e - - def check_status(self): - self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() - - def tenant_list(self): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): - res = self.post( - f"http://{self.host}:{self.port}/v1/tenant", - json={"new_tenant_id": new_tenant_id.hex, "generation": 1}, - ) - - if res.status_code == 409: - if ok_if_exists: - print(f"could not create tenant: already exists for id {new_tenant_id}") - else: - res.raise_for_status() - elif res.status_code == 201: - print(f"created tenant {new_tenant_id}") - else: - self.verbose_error(res) - - return new_tenant_id - - def timeline_list(self, tenant_id: uuid.UUID): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}" - - -def lsn_from_hex(lsn_hex: str) -> int: - """Convert lsn from hex notation to int.""" - left, right = 
lsn_hex.split("/") - return (int(left, 16) << 32) + int(right, 16) - - -def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) - - -def wait_for_upload( - pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, -): - """waits for local timeline upload up to specified lsn""" - for i in range(10): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - return - print( - f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}" - ) - time.sleep(1) - - raise Exception( - f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}" - ) - - -############## -# End of utils -############## - - -def pack_base(log_dir, restored_dir, output_tar): - """Create tar file from basebackup, being careful to produce relative filenames.""" - tmp_tar_name = "tmp.tar" - tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) - cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) - # We actually cd into the dir and call tar from there. If we call tar from - # outside we won't encode filenames as relative, and they won't parse well - # on import. - subprocess_capture(log_dir, cmd, cwd=restored_dir) - shutil.move(tmp_tar_path, output_tar) - - -def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): - """Reconstruct what relation files should exist in the datadir by querying postgres.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) - - # Start a vanilla postgres from the given datadir and query it to find - # what relfiles should exist, but possibly don't. 
- with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) - vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) - - # Create database based on template0 because we can't connect to template0 - query = "create database template0copy template template0" - vanilla_pg.safe_psql(query, user="cloud_admin") - vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") - - # Get all databases - query = "select oid, datname from pg_database" - oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") - template0_oid = [ - oid for (oid, database) in oid_dbname_pairs if database == "template0" - ][0] - - # Get rel paths for each database - for oid, database in oid_dbname_pairs: - if database == "template0": - # We can't connect to template0 - continue - - query = "select relname, pg_relation_filepath(oid) from pg_class" - result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) - for _relname, filepath in result: - if filepath is not None: - if database == "template0copy": - # Add all template0copy paths to template0 - prefix = f"base/{oid}/" - if filepath.startswith(prefix): - suffix = filepath[len(prefix) :] - yield f"base/{template0_oid}/{suffix}" - elif filepath.startswith("global"): - print(f"skipping {database} global file {filepath}") - else: - raise AssertionError - else: - yield filepath - - -def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): - """Add the appropriate empty files to a basebadkup tar.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) - - # Touch files that don't exist - for path in paths: - absolute_path = os.path.join(restored_dir, path) - exists = os.path.exists(absolute_path) - if not exists: - print(f"File {absolute_path} didn't exist. Creating..") - Path(absolute_path).touch() - - # Repackage - pack_base(log_dir, restored_dir, output_tar) - - -# HACK This is a workaround for exporting from old pageservers that -# can't export empty relations. In this case we need to start -# a vanilla postgres from the exported datadir, and query it -# to see what empty relations are missing, and then create -# those empty files before importing. 
-def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): - reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) - touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) - - -def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - with closing(psycopg2.connect(pageserver_connstr)) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - assert res is not None - prev_lsn = res[0] - last_lsn = res[1] - - return last_lsn, prev_lsn - - -def import_timeline( - args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" - full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - - stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") - - print(f"Running: {full_cmd}") - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename2, "w") as stderr_f: - print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True, - ) - - print("Done import") - - # Wait until pageserver persists the files - wait_for_upload( - pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) - ) - - -def export_timeline( - args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Choose filenames - incomplete_filename = tar_filename + ".incomplete" - stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") - - # Construct export command - query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" - cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] - - # Run export command - print(f"Running: {cmd}") - with open(incomplete_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True - ) - - # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) - - # Log more info - file_size = os.path.getsize(tar_filename) - print(f"Done export: {tar_filename}, size {file_size}") - - -def main(args: argparse.Namespace): - # any psql version will do here. 
use current DEFAULT_PG_VERSION = 15 - psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") - - old_pageserver_host = args.old_pageserver_host - new_pageserver_host = args.new_pageserver_host - - old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) - old_http_client.check_status() - old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" - - new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) - new_http_client.check_status() - new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" - - for tenant_id in args.tenants: - print(f"Tenant: {tenant_id}") - timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) - print(f"Timelines: {timelines}") - - # Create tenant in new pageserver - if args.only_import is False and not args.timelines: - new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) - - for timeline in timelines: - # Skip timelines we don't need to export - if args.timelines and timeline["timeline_id"] not in args.timelines: - print(f"Skipping timeline {timeline['timeline_id']}") - continue - - # Choose filenames - tar_filename = os.path.join( - args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" - ) - - pg_version = timeline["pg_version"] - - # Export timeline from old pageserver - if args.only_import is False: - last_lsn, prev_lsn = get_rlsn( - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - ) - export_timeline( - args, - psql_path, - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Import into new pageserver - import_timeline( - args, - psql_path, - new_pageserver_connstr, - new_http_client, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Re-export and compare - re_export_filename = tar_filename + ".reexport" - export_timeline( - args, - psql_path, - new_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - re_export_filename, - pg_version, - ) - - # Check the size is the same - old_size = (os.path.getsize(tar_filename),) - new_size = (os.path.getsize(re_export_filename),) - if old_size != new_size: - raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") - - -def non_zero_tcp_port(arg: Any): - port = int(arg) - if port < 1 or port > 65535: - raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") - return port - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant-id", - dest="tenants", - required=True, - nargs="+", - help="Id of the tenant to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--timeline-id", - dest="timelines", - required=False, - nargs="+", - help="Id of the timeline to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--from-host", - dest="old_pageserver_host", - required=True, - help="Host of the pageserver to migrate data from", - ) - parser.add_argument( - "--from-http-port", - dest="old_pageserver_http_port", - required=False, - type=int, - default=9898, - help="HTTP port of the pageserver to migrate data from. 
Default: 9898", - ) - parser.add_argument( - "--from-pg-port", - dest="old_pageserver_pg_port", - required=False, - type=int, - default=6400, - help="pg port of the pageserver to migrate data from. Default: 6400", - ) - parser.add_argument( - "--to-host", - dest="new_pageserver_host", - required=True, - help="Host of the pageserver to migrate data to", - ) - parser.add_argument( - "--to-http-port", - dest="new_pageserver_http_port", - required=False, - default=9898, - type=int, - help="HTTP port of the pageserver to migrate data to. Default: 9898", - ) - parser.add_argument( - "--to-pg-port", - dest="new_pageserver_pg_port", - required=False, - default=6400, - type=int, - help="pg port of the pageserver to migrate data to. Default: 6400", - ) - parser.add_argument( - "--ignore-tenant-exists", - dest="ok_if_exists", - required=False, - help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", - ) - parser.add_argument( - "--pg-distrib-dir", - dest="pg_distrib_dir", - required=False, - default="/usr/local/", - help="Path where postgres binaries are installed. Default: /usr/local/", - ) - parser.add_argument( - "--psql-path", - dest="psql_path", - required=False, - default="/usr/local/v14/bin/psql", - help="Path to the psql binary. Default: /usr/local/v14/bin/psql", - ) - parser.add_argument( - "--only-import", - dest="only_import", - required=False, - default=False, - action="store_true", - help="Skip export and tenant creation part", - ) - parser.add_argument( - "--work-dir", - dest="work_dir", - required=True, - default=False, - help="directory where temporary tar files are stored", - ) - parser.add_argument( - "--tmp-pg-port", - dest="tmp_pg_port", - required=False, - default=55439, - type=non_zero_tcp_port, - help="localhost port to use for temporary postgres instance", - ) - args = parser.parse_args() - main(args) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9def3ad1c2..68d9d9a660 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -16,7 +16,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, wait_tenant_status_404, ) -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, @@ -24,7 +23,6 @@ from fixtures.remote_storage import ( from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, - subprocess_capture, wait_until, ) @@ -184,20 +182,14 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. "minor", - # A major migration involves exporting a postgres datadir - # basebackup and importing it into the new pageserver. - # This kind of migration can tolerate breaking changes - # to storage format - "major", + # In the unlikely and unfortunate event that we have to break + # the storage format, extend this test with the param below. 
+ # "major", ], ) @pytest.mark.parametrize("with_load", ["with_load", "without_load"]) def test_tenant_relocation( neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, - base_dir: Path, method: str, with_load: str, ): @@ -299,40 +291,7 @@ def test_tenant_relocation( current_lsn=current_lsn_second, ) - # Migrate either by attaching from s3 or import/export basebackup - if method == "major": - cmd = [ - "poetry", - "run", - "python", - str(base_dir / "scripts/export_import_between_pageservers.py"), - "--tenant-id", - str(tenant_id), - "--from-host", - "localhost", - "--from-http-port", - str(origin_http.port), - "--from-pg-port", - str(origin_ps.service_port.pg), - "--to-host", - "localhost", - "--to-http-port", - str(destination_http.port), - "--to-pg-port", - str(destination_ps.service_port.pg), - "--pg-distrib-dir", - str(neon_env_builder.pg_distrib_dir), - "--work-dir", - str(test_output_dir), - "--tmp-pg-port", - str(port_distributor.get_port()), - ] - subprocess_capture(test_output_dir, cmd, check=True) - - destination_ps.allowed_errors.append( - ".*ignored .* unexpected bytes after the tar archive.*" - ) - elif method == "minor": + if method == "minor": # call to attach timeline to new pageserver destination_ps.tenant_attach(tenant_id) From fa12d6023781e3d3972e77a8cc4be58bc24dd810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 23 Apr 2024 13:42:58 +0200 Subject: [PATCH 059/157] Don't pass tenant_id in location_config requests from storage controller (#7476) Tested this locally via a simple patch, the `tenant_id` is now gone from the json. Follow-up of #7055, prerequisite for #7469. --- libs/pageserver_api/src/models.rs | 1 + pageserver/client/src/mgmt_api.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index f441d1ff1a..e334a68a1e 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -429,6 +429,7 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { + #[serde(skip_serializing_if = "Option::is_none")] pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 3c9982ffb8..892e6c2758 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -279,7 +279,7 @@ impl Client { lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { - tenant_id: Some(tenant_shard_id), + tenant_id: None, config, }; From a9fda8c8327b39c9d543bf22c02186c279cc152a Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Apr 2024 14:03:33 +0100 Subject: [PATCH 060/157] pageserver: fix vectored read aux key handling (#7404) ## Problem Vectored get would descend into ancestor timelines for aux files. This is not the behaviour of the legacy read path and blocks cutting over to the vectored read path. Fixes https://github.com/neondatabase/neon/issues/7379 ## Summary of Changes Treat non inherited keys specially in vectored get. At the point when we want to descend into the ancestor mark all pending non inherited keys as errored out at the key level. Note that this diverges from the standard vectored get behaviour for missing keys which is a top level error. 
This divergence is required to avoid blocking compaction in case such an error is encountered when compaction aux files keys. I'm pretty sure the bug I just described predates the vectored get implementation, but it's still worth fixing. --- libs/pageserver_api/src/key.rs | 8 ++-- libs/pageserver_api/src/keyspace.rs | 53 ++++++++++++++++++++++++--- pageserver/src/tenant.rs | 57 +++++++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 45 ++++++++++++++++++++++- 4 files changed, 152 insertions(+), 11 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 852670af2c..1d66dd8878 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -48,11 +48,11 @@ impl Key { } } - pub fn next(&self) -> Key { + pub const fn next(&self) -> Key { self.add(1) } - pub fn add(&self, x: u32) -> Key { + pub const fn add(&self, x: u32) -> Key { let mut key = *self; let r = key.field6.overflowing_add(x); @@ -475,12 +475,14 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); + // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY + !NON_INHERITED_RANGE.contains(&key) } #[inline(always)] diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 05fa4562e1..78e4a3d735 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -94,12 +94,13 @@ impl KeySpace { /// Remove all keys in `other` from `self`. /// This can involve splitting or removing of existing ranges. - pub fn remove_overlapping_with(&mut self, other: &KeySpace) { + /// Returns the removed keyspace + pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace { let (self_start, self_end) = match (self.start(), self.end()) { (Some(start), Some(end)) => (start, end), _ => { // self is empty - return; + return KeySpace::default(); } }; @@ -112,30 +113,37 @@ impl KeySpace { .skip_while(|range| self_start >= range.end) .take_while(|range| self_end > range.start); + let mut removed_accum = KeySpaceRandomAccum::new(); for range in other_ranges { while let Some(overlap_at) = self.overlaps_at(range) { let overlapped = self.ranges[overlap_at].clone(); if overlapped.start < range.start && overlapped.end <= range.end { // Higher part of the range is completely overlapped. + removed_accum.add_range(range.start..self.ranges[overlap_at].end); self.ranges[overlap_at].end = range.start; } if overlapped.start >= range.start && overlapped.end > range.end { // Lower part of the range is completely overlapped. + removed_accum.add_range(self.ranges[overlap_at].start..range.end); self.ranges[overlap_at].start = range.end; } if overlapped.start < range.start && overlapped.end > range.end { // Middle part of the range is overlapped. 
+ removed_accum.add_range(range.clone()); self.ranges[overlap_at].end = range.start; self.ranges .insert(overlap_at + 1, range.end..overlapped.end); } if overlapped.start >= range.start && overlapped.end <= range.end { // Whole range is overlapped + removed_accum.add_range(self.ranges[overlap_at].clone()); self.ranges.remove(overlap_at); } } } + + removed_accum.to_keyspace() } pub fn start(&self) -> Option { @@ -553,7 +561,16 @@ mod tests { Key::from_i128(11)..Key::from_i128(13), ], }; - key_space1.remove_overlapping_with(&key_space2); + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -583,7 +600,17 @@ mod tests { Key::from_i128(14)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(5), + Key::from_i128(8)..Key::from_i128(10), + Key::from_i128(14)..Key::from_i128(15), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -610,7 +637,11 @@ mod tests { Key::from_i128(15)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace::default(); + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -637,7 +668,17 @@ mod tests { let key_space2 = KeySpace { ranges: vec![Key::from_i128(9)..Key::from_i128(19)], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(9)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(19), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 15be6df637..098bad71fb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3859,6 +3859,7 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; + use pageserver_api::key::NON_INHERITED_RANGE; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -4658,6 +4659,62 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_get_vectored_aux_files() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_aux_files")?; + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + + let child_timeline_id = TimelineId::generate(); + tenant + .branch_timeline_test( + tline, + child_timeline_id, + Some(tline.get_last_record_lsn()), + &ctx, + ) + .await?; + + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched 
timeline"); + + let aux_keyspace = KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }; + let read_lsn = child_timeline.get_last_record_lsn(); + + let vectored_res = child_timeline + .get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx) + .await; + + child_timeline + .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) + .await; + + let images = vectored_res?; + let mut key = NON_INHERITED_RANGE.start; + while key < NON_INHERITED_RANGE.end { + assert!(matches!( + images[&key], + Err(PageReconstructError::MissingKey(_)) + )); + key = key.next(); + } + + Ok(()) + } + // Test that vectored get handles layer gaps correctly // by advancing into the next ancestor timeline if required. // diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fa7d219fb0..fb5ee0a8fa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,7 +16,7 @@ use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::AUX_FILES_KEY, + key::{AUX_FILES_KEY, NON_INHERITED_RANGE}, keyspace::KeySpaceAccum, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -943,7 +943,13 @@ impl Timeline { Err(MissingKey(MissingKeyError { stuck_at_lsn: false, .. - })) => return Err(GetVectoredError::MissingKey(key)), + })) if !NON_INHERITED_RANGE.contains(&key) => { + // The vectored read path handles non inherited keys specially. + // If such a a key cannot be reconstructed from the current timeline, + // the vectored read path returns a key level error as opposed to a top + // level error. + return Err(GetVectoredError::MissingKey(key)); + } _ => { values.insert(key, block); key = key.next(); @@ -3024,6 +3030,41 @@ impl Timeline { .await?; keyspace.remove_overlapping_with(&completed); + + // Do not descend into the ancestor timeline for aux files. + // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid + // stalling compaction. + // TODO(chi): this will need to be updated for aux files v2 storage + if keyspace.overlaps(&NON_INHERITED_RANGE) { + let removed = keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }); + + for range in removed.ranges { + let mut key = range.start; + while key < range.end { + reconstruct_state.on_key_error( + key, + PageReconstructError::MissingKey(MissingKeyError { + stuck_at_lsn: false, + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, + traversal_path: Vec::default(), + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + None + }, + }), + ); + key = key.next(); + } + } + } + if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { break; } From 28e7fa98c4d8f8ef96fd2931f03543f8e06a2389 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Apr 2024 14:05:02 +0100 Subject: [PATCH 061/157] pageserver: add read depth metrics and test (#7464) ## Problem We recently went through an incident where compaction was inhibited by a bug. We didn't observe this until quite late because we did not have alerting on deep reads. ## Summary of changes + Tweak an existing metric that tracks the depth of a read on the non-vectored read path: * Give it a better name * Track all layers * Larger buckets + Add a similar metric for the vectored read path + Add a compaction smoke test which uses these metrics. This test would have caught the compaction issue mentioned earlier. 
Related https://github.com/neondatabase/neon/issues/7428 --- pageserver/src/metrics.rs | 20 ++++-- pageserver/src/tenant/storage_layer.rs | 10 +++ pageserver/src/tenant/timeline.rs | 15 ++++- test_runner/fixtures/metrics.py | 2 +- test_runner/regress/test_compaction.py | 93 ++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 9 deletions(-) create mode 100644 test_runner/regress/test_compaction.py diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e6db95082b..66bf21ddec 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -86,11 +86,20 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - "Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_visited_per_vectored_read_global", + "Average number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); @@ -2771,7 +2780,8 @@ pub fn preinitialize_metrics() { // histograms [ - &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, + &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9a2b086828..9ddd916700 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState { pub(crate) keys: HashMap>, keys_done: KeySpaceRandomAccum, + layers_visited: u32, } impl ValuesReconstructState { @@ -125,6 +126,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + layers_visited: 0, } } @@ -138,6 +140,14 @@ impl ValuesReconstructState { } } + pub(crate) fn on_layer_visited(&mut self) { + self.layers_visited += 1; + } + + pub(crate) fn get_layers_visited(&self) -> u32 { + self.layers_visited + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fb5ee0a8fa..2fbe3c63a2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -973,6 +973,7 @@ impl Timeline { .await?; let mut results: BTreeMap> = BTreeMap::new(); + let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { match res { Err(err) => { @@ -987,6 +988,12 @@ impl Timeline { } } + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. 
segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + Ok(results) } @@ -2813,7 +2820,7 @@ impl Timeline { let mut timeline = self; let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) }); // For debugging purposes, collect the path of layers that we traversed @@ -2928,7 +2935,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` + *read_count += 1; traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } @@ -2955,7 +2962,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` + *read_count += 1; traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } @@ -3183,6 +3190,8 @@ impl Timeline { unmapped_keyspace = keyspace_to_read; cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(); } else { break; } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c615dd154f..7d34e12ca3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_read_num_fs_layers"), + *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py new file mode 100644 index 0000000000..37b87b92a9 --- /dev/null +++ b/test_runner/regress/test_compaction.py @@ -0,0 +1,93 @@ +import os + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.workload import Workload + +AGGRESIVE_COMPACTION_TENANT_CONF = { + # Disable gc and compaction. The test runs compaction manually. + "gc_period": "0s", + "compaction_period": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, + # INC-186: remove when merging the fix + "image_layer_creation_check_threshold": 0, +} + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): + """ + This is a smoke test that compaction kicks in. The workload repeatedly churns + a small number of rows and manually instructs the pageserver to run compaction + between iterations. At the end of the test validate that the average number of + layers visited to gather reconstruct data for a given key is within the empirically + observed bounds. + """ + + # Effectively disable the page cache to rely only on image layers + # to shorten reads. 
+ neon_env_builder.pageserver_config_override = """ +page_cache_size=10 +""" + + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 100 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + log.info("Checking layer access metrics ...") + + layer_access_metric_names = [ + "pageserver_layers_visited_per_read_global_sum", + "pageserver_layers_visited_per_read_global_count", + "pageserver_layers_visited_per_read_global_bucket", + "pageserver_layers_visited_per_vectored_read_global_sum", + "pageserver_layers_visited_per_vectored_read_global_count", + "pageserver_layers_visited_per_vectored_read_global_bucket", + ] + + metrics = env.pageserver.http_client().get_metrics() + for name in layer_access_metric_names: + layer_access_metrics = metrics.query_all(name) + log.info(f"Got metrics: {layer_access_metrics}") + + non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") + non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + + vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_average = vectored_sum.value / vectored_count.value + + log.info(f"{non_vectored_average=} {vectored_average=}") + + # The upper bound for average number of layer visits below (8) + # was chosen empirically for this workload. + assert non_vectored_average < 8 + assert vectored_average < 8 From 8426fb886bcb19e509b2d4d40a0682316163685f Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Apr 2024 14:20:12 +0100 Subject: [PATCH 062/157] storage_controller: wait for db on startup (#7479) ## Problem In some dev/test environments, there aren't health checks to guarantee the database is available before starting the controller. This creates friction for the developer. 
## Summary of changes - Wait up to 5 seconds for the database to become available on startup --- storage_controller/src/main.rs | 3 +++ storage_controller/src/persistence.rs | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 6466b9f7a3..ca55d6c593 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,6 +5,7 @@ use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::sync::Arc; +use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -245,6 +246,8 @@ async fn async_main() -> anyhow::Result<()> { }; // After loading secrets & config, but before starting anything else, apply database migrations + Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?; + migration_run(&secrets.database_url) .await .context("Running database migrations")?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 5312e1e218..dca37166ba 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,6 +2,7 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use std::time::Instant; use self::split_state::SplitState; use camino::Utf8Path; @@ -144,6 +145,31 @@ impl Persistence { } } + /// A helper for use during startup, where we would like to tolerate concurrent restarts of the + /// database and the storage controller, therefore the database might not be available right away + pub async fn await_connection( + database_url: &str, + timeout: Duration, + ) -> Result<(), diesel::ConnectionError> { + let started_at = Instant::now(); + loop { + match PgConnection::establish(database_url) { + Ok(_) => { + tracing::info!("Connected to database."); + return Ok(()); + } + Err(e) => { + if started_at.elapsed() > timeout { + return Err(e); + } else { + tracing::info!("Database not yet available, waiting... ({e})"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + } + } + /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where From 89f023e6b0d18f39b08197d0db9875aa1fe924ed Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 23 Apr 2024 11:16:04 -0400 Subject: [PATCH 063/157] feat(pageserver): add metadata key range and aux key encoding (#7401) Extracted from https://github.com/neondatabase/neon/pull/7375. We assume everything >= 0x80 are metadata keys. AUX file keys are part of the metadata keys, and we use `0x90` as the prefix for AUX file keys. The AUX file encoding is described in the code comment. We use xxhash128 as the hash algorithm. It seems to be portable according to the introduction, > xxHash is an Extremely fast Hash algorithm, processing at RAM speed limits. Code is highly portable, and produces hashes identical across all platforms (little / big endian). ...though whether the Rust version follows the same convention is unknown and might need manual review of the library. Anyways, we can always change the hash algorithm before rolling it out in staging/end-user, and I made a quick decision to use xxhash here because it generates 128b hash + portable. We can save the discussion of which hash algorithm to use later. 
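To make the layout concrete, here is a minimal, illustrative sketch of the encoding described above. It is not the implementation itself: the prefix byte and the directory code for `pg_logical/mappings/` follow the constants actually defined in the new `pageserver/src/aux_file.rs` below (`AUX_KEY_PREFIX = 0x82`), and the expected output is derived from the hash value asserted in that file's tests. The real code additionally widens the 16-byte metadata key into the 18-byte storage `Key` via `Key::from_metadata_key_fixed_size`.

```rust
use twox_hash::xxh3;

// Values as defined in this patch (pageserver/src/aux_file.rs); any key whose
// first byte is >= 0x80 falls into the metadata key range.
const AUX_KEY_PREFIX: u8 = 0x82;
const METADATA_KEY_SIZE: usize = 16;

/// Sketch of the fixed-size AUX key layout:
/// [prefix, dir_level1, dir_level2, first 13 bytes of xxh3-128(file name)]
fn example_aux_key(dir_level1: u8, dir_level2: u8, fname: &[u8]) -> [u8; METADATA_KEY_SIZE] {
    let mut key = [0u8; METADATA_KEY_SIZE];
    let hash = xxh3::hash128(fname).to_be_bytes();
    key[0] = AUX_KEY_PREFIX;
    key[1] = dir_level1;
    key[2] = dir_level2;
    key[3..16].copy_from_slice(&hash[0..13]);
    key
}

fn main() {
    // "pg_logical/mappings/<name>" is assigned the directory code (0x01, 0x01) in this patch.
    let key = example_aux_key(0x01, 0x01, b"test1");
    for b in key {
        print!("{:02X}", b);
    }
    println!();
    // Expected: 820101E5B20C5F8DD5AA3289D6D9EAFA -- the 13-byte hash suffix matches the
    // value asserted in the patch's `test_encoding_portable` for pg_logical/mappings/test1.
}
```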
--------- Signed-off-by: Alex Chi Z --- Cargo.lock | 1 + libs/pageserver_api/src/key.rs | 93 +++++++++++++++++++++++ pageserver/Cargo.toml | 1 + pageserver/src/aux_file.rs | 112 ++++++++++++++++++++++++++++ pageserver/src/lib.rs | 1 + pageserver/src/pgdatadir_mapping.rs | 2 +- 6 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 pageserver/src/aux_file.rs diff --git a/Cargo.lock b/Cargo.lock index cff07239e7..85a59ec0ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3658,6 +3658,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "twox-hash", "url", "utils", "walkdir", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 1d66dd8878..01919e8325 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,8 +1,10 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; +use std::ops::RangeInclusive; use std::{fmt, ops::Range}; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -21,9 +23,81 @@ pub struct Key { pub field6: u32, } +/// The storage key size. pub const KEY_SIZE: usize = 18; +/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized. +/// See [`Key::to_i128`] for more information on the encoding. +pub const METADATA_KEY_SIZE: usize = 16; + +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 is a metadata key. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80; + +/// The (reserved) key prefix of relation sizes. +pub const RELATION_SIZE_PREFIX: u8 = 0x81; + +/// The key prefix of AUX file keys. +pub const AUX_KEY_PREFIX: u8 = 0x82; + +/// Check if the key falls in the range of metadata keys. +pub const fn is_metadata_key_slice(key: &[u8]) -> bool { + key[0] >= METADATA_KEY_BEGIN_PREFIX +} + impl Key { + /// Check if the key falls in the range of metadata keys. + pub const fn is_metadata_key(&self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { + assert!(is_metadata_key_slice(key), "key not in metadata key range"); + Key { + field1: key[0], + field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, + field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), + field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), + field5: key[11], + field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), + } + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key(key: &[u8]) -> Self { + Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) + } + + /// Extract a metadata key to a writer. The result should always be 16 bytes. + pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { + writer.put_u8(self.field1); + assert!(self.field2 <= 0xFFFF); + writer.put_u16(self.field2 as u16); + writer.put_u32(self.field3); + writer.put_u32(self.field4); + writer.put_u8(self.field5); + writer.put_u32(self.field6); + } + + /// Get the range of metadata keys. 
+ pub fn metadata_key_range() -> RangeInclusive { + Key { + field1: METADATA_KEY_BEGIN_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..=Key { + field1: u8::MAX, + field2: u16::MAX as u32, + field3: u32::MAX, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 @@ -81,6 +155,8 @@ impl Key { key } + /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::from_metadata_key`] instead. pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -92,6 +168,8 @@ impl Key { } } + /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::extract_metadata_key_to_writer`] instead. pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -558,11 +636,14 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; + use crate::key::is_metadata_key_slice; use crate::key::Key; use rand::Rng; use rand::SeedableRng; + use super::AUX_KEY_PREFIX; + #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); @@ -578,4 +659,16 @@ mod tests { assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); } + + #[test] + fn test_metadata_keys() { + let mut metadata_key = vec![AUX_KEY_PREFIX]; + metadata_key.extend_from_slice(&[0xFF; 15]); + let encoded_key = Key::from_metadata_key(&metadata_key); + let mut output_key = Vec::new(); + encoded_key.extract_metadata_key_to_writer(&mut output_key); + assert_eq!(metadata_key, output_key); + assert!(encoded_key.is_metadata_key()); + assert!(is_metadata_key_slice(&metadata_key)); + } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7a11610a91..4335f38f1e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -70,6 +70,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs new file mode 100644 index 0000000000..aba4ccf19d --- /dev/null +++ b/pageserver/src/aux_file.rs @@ -0,0 +1,112 @@ +use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use tracing::warn; + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash]. +fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { + let mut key = [0; METADATA_KEY_SIZE]; + let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + key[0] = AUX_KEY_PREFIX; + key[1] = dir_level1; + key[2] = dir_level2; + key[3..16].copy_from_slice(&hash[0..13]); + Key::from_metadata_key_fixed_size(&key) +} + +const AUX_DIR_PG_LOGICAL: u8 = 0x01; +const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; + +/// Encode the aux file into a fixed-size key. +/// +/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. +/// We have one-to-one mapping for each of the aux file that we support. 
We hash the remaining part of the path +/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix +/// is roughly based on the first two components of the path, one unique number for one component. +/// +/// * pg_logical/mappings -> 0x0101 +/// * pg_logical/snapshots -> 0x0102 +/// * pg_logical/replorigin_checkpoint -> 0x0103 +/// * pg_logical/others -> 0x01FF +/// * pg_replslot/ -> 0x0201 +/// * others -> 0xFFFF +/// +/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. +/// The new file type must have never been written to the storage before. Otherwise, there could be data +/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix. +pub fn encode_aux_file_key(path: &str) -> Key { + if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) + } else if path == "pg_logical/replorigin_checkpoint" { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") + } else if let Some(fname) = path.strip_prefix("pg_logical/") { + if cfg!(debug_assertions) { + warn!( + "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_replslot/") { + aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else { + if cfg!(debug_assertions) { + warn!( + "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hash_portable() { + // AUX file encoding requires the hash to be portable across all platforms. This test case checks + // if the algorithm produces the same hash across different environments. + assert_eq!( + 305317690835051308206966631765527126151, + twox_hash::xxh3::hash128("test1".as_bytes()) + ); + assert_eq!( + 85104974691013376326742244813280798847, + twox_hash::xxh3::hash128("test/test2".as_bytes()) + ); + assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); + } + + #[test] + fn test_encoding_portable() { + // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions + // of the page server. 
+ assert_eq!( + "8200000101E5B20C5F8DD5AA3289D6D9EAFA", + encode_aux_file_key("pg_logical/mappings/test1").to_string() + ); + assert_eq!( + "820000010239AAC544893139B26F501B97E6", + encode_aux_file_key("pg_logical/snapshots/test2").to_string() + ); + assert_eq!( + "820000010300000000000000000000000000", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + ); + assert_eq!( + "82000001FF8635AF2134B7266EC5B4189FD6", + encode_aux_file_key("pg_logical/unsupported").to_string() + ); + assert_eq!( + "8200000201772D0E5D71DE14DA86142A1619", + encode_aux_file_key("pg_replslot/test3").to_string() + ); + assert_eq!( + "820000FFFF1866EBEB53B807B26A2416F317", + encode_aux_file_key("other_file_not_supported").to_string() + ); + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f947a75f61..930700e50c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub use pageserver_api::keyspace; +pub mod aux_file; pub mod metrics; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4a9682dcac..c733b38acb 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1402,7 +1402,7 @@ impl<'a> DatadirModification<'a> { let n_files; let mut aux_files = self.tline.aux_files.lock().await; if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value + // We already updated aux files in `self`: emit a delta and update our latest value. dir.upsert(file_path.clone(), content.clone()); n_files = dir.files.len(); if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { From e22c072064ac32e4d9af7e6813beeb392f6d5ffe Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Apr 2024 16:24:51 +0100 Subject: [PATCH 064/157] remote_storage: fix prefix handling in remote storage & clean up (#7431) ## Problem Split off from https://github.com/neondatabase/neon/pull/7399, which is the first piece of code that does a WithDelimiter object listing using a prefix that isn't a full directory name. ## Summary of changes - Revise list function to not append a `/` to the prefix -- prefixes don't have to end with a slash. - Fix local_fs implementation of list to not assume that WithDelimiter case will always use a directory as a prerfix. 
- Remove `list_files`, `list_prefixes` wrappers, as they add little value and obscure the underlying list function -- we need callers to understand the semantics of what they're really calling (listobjectsv2) --- libs/remote_storage/src/lib.rs | 94 ++---- libs/remote_storage/src/local_fs.rs | 292 +++++++++++------- libs/remote_storage/src/s3_bucket.rs | 44 ++- libs/remote_storage/src/simulate_failures.rs | 21 -- libs/remote_storage/tests/common/tests.rs | 51 ++- libs/remote_storage/tests/test_real_azure.rs | 4 - libs/remote_storage/tests/test_real_s3.rs | 21 +- .../src/tenant/remote_timeline_client.rs | 16 +- .../tenant/remote_timeline_client/download.rs | 11 +- safekeeper/src/wal_backup.rs | 19 +- 10 files changed, 305 insertions(+), 268 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e708854be2..14c391ca53 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -134,6 +134,11 @@ impl RemotePath { pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { self.0.strip_prefix(&p.0) } + + pub fn add_trailing_slash(&self) -> Self { + // Unwrap safety inputs are guararnteed to be valid UTF-8 + Self(format!("{}/", self.0).try_into().unwrap()) + } } /// We don't need callers to be able to pass arbitrary delimiters: just control @@ -157,47 +162,21 @@ pub struct Listing { /// providing basic CRUD operations for storage files. #[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// Lists all top level subdirectories for a given prefix - /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id - /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) - /// so this method doesnt need to. - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::WithDelimiter, None, cancel) - .await? - .prefixes; - Ok(result) - } - /// Lists all files in directory "recursively" - /// (not really recursively, because AWS has a flat namespace) - /// Note: This is subtely different than list_prefixes, - /// because it is for listing files instead of listing - /// names sharing common prefixes. - /// For example, - /// list_files("foo/bar") = ["foo/bar/cat123.txt", - /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"] - /// whereas, - /// list_prefixes("foo/bar/") = ["cat", "dog"] - /// See `test_real_s3.rs` for more details. + /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. + /// (see ``) + /// + /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not + /// from the absolute root of the bucket. + /// + /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of + /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are + /// returned in `keys` (). + /// + /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function + /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on + /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. 
/// - /// max_keys limits max number of keys returned; None means unlimited. - async fn list_files( - &self, - prefix: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::NoDelimiter, max_keys, cancel) - .await? - .keys; - Ok(result) - } - async fn list( &self, prefix: Option<&RemotePath>, @@ -336,41 +315,6 @@ impl GenericRemoteStorage> { } } - // A function for listing all the files in a "directory" - // Example: - // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - // - // max_keys limits max number of keys returned; None means unlimited. - pub async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await, - Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await, - Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await, - Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await, - } - } - - // lists common *prefixes*, if any of files - // Example: - // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"] - pub async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await, - Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await, - Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await, - Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await, - } - } - /// See [`RemoteStorage::upload`] pub async fn upload( &self, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 8cad863731..1f7bcfc982 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,11 +5,9 @@ //! volume is mounted to the local FS. use std::{ - borrow::Cow, - future::Future, + collections::HashSet, io::ErrorKind, num::NonZeroU32, - pin::Pin, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -22,11 +20,11 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tokio_util::{io::ReaderStream, sync::CancellationToken}; -use tracing::*; -use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; +use utils::crashsafe::path_with_suffix_extension; use crate::{ Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -93,7 +91,47 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - Ok(get_all_files(&self.storage_root, true) + use std::{future::Future, pin::Pin}; + fn get_all_files<'a, P>( + directory_path: P, + ) -> Pin>> + Send + Sync + 'a>> + where + P: AsRef + Send + Sync + 'a, + { + Box::pin(async move { + let directory_path = directory_path.as_ref(); + if directory_path.exists() { + if directory_path.is_dir() { + let mut paths = Vec::new(); + let mut dir_contents = fs::read_dir(directory_path).await?; + while let Some(dir_entry) = dir_contents.next_entry().await? 
{ + let file_type = dir_entry.file_type().await?; + let entry_path = + Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { + anyhow::Error::msg(format!( + "non-Unicode path: {}", + pb.to_string_lossy() + )) + })?; + if file_type.is_symlink() { + tracing::debug!("{entry_path:?} is a symlink, skipping") + } else if file_type.is_dir() { + paths.extend(get_all_files(&entry_path).await?.into_iter()) + } else { + paths.push(entry_path); + } + } + Ok(paths) + } else { + bail!("Path {directory_path:?} is not a directory") + } + } else { + Ok(Vec::new()) + } + }) + } + + Ok(get_all_files(&self.storage_root) .await? .into_iter() .map(|path| { @@ -120,6 +158,14 @@ impl LocalFs { // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); + + // If there's no trailing slash, we have to start looking from one above: even if + // `initial_dir` is a directory, we should still list any prefixes in the parent + // that start with the same string. + if !full_path.to_string().ends_with('/') { + initial_dir.pop(); + } + loop { // Did we make it to the root? if initial_dir.parent().is_none() { @@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs { let op = async { let mut result = Listing::default(); - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - - if let Some(max_keys) = max_keys { - result.keys.truncate(max_keys.get() as usize); - } - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) + // Filter out directories: in S3 directories don't exist, only the keys within them do. + let keys = self + .list_recursive(prefix) .await .map_err(DownloadError::Other)?; + let keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? - { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); + if let ListingMode::NoDelimiter = mode { + result.keys = keys; + } else { + let mut prefixes = HashSet::new(); + for key in keys { + // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. + let relative_key = if let Some(prefix) = prefix { + let mut prefix = prefix.clone(); + // We only strip the dirname of the prefix, so that when we strip it from the start of keys we + // end up with full file/dir names. 
+ let prefix_full_local_path = prefix.with_base(&self.storage_root); + let has_slash = prefix.0.to_string().ends_with('/'); + let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { + prefix + } else { + prefix.0.pop(); + prefix + }; + + RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() + } else { + key + }; + + let relative_key = format!("{}", relative_key); + if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { + let first_part = relative_key + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .next() + .unwrap() + .to_owned(); + prefixes.insert(first_part); + } else { + result + .keys + .push(RemotePath::from_string(&relative_key).unwrap()); + } } + result.prefixes = prefixes + .into_iter() + .map(|s| RemotePath::from_string(&s).unwrap()) + .collect(); } + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } Ok(result) }; @@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { path_with_suffix_extension(original_path, "metadata") } -fn get_all_files<'a, P>( - directory_path: P, - recursive: bool, -) -> Pin>> + Send + Sync + 'a>> -where - P: AsRef + Send + Sync + 'a, -{ - Box::pin(async move { - let directory_path = directory_path.as_ref(); - if directory_path.exists() { - if directory_path.is_dir() { - let mut paths = Vec::new(); - let mut dir_contents = fs::read_dir(directory_path).await?; - while let Some(dir_entry) = dir_contents.next_entry().await? { - let file_type = dir_entry.file_type().await?; - let entry_path = - Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { - anyhow::Error::msg(format!( - "non-Unicode path: {}", - pb.to_string_lossy() - )) - })?; - if file_type.is_symlink() { - debug!("{entry_path:?} is a symlink, skipping") - } else if file_type.is_dir() { - if recursive { - paths.extend(get_all_files(&entry_path, true).await?.into_iter()) - } else { - paths.push(entry_path) - } - } else { - paths.push(entry_path); - } - } - Ok(paths) - } else { - bail!("Path {directory_path:?} is not a directory") - } - } else { - Ok(Vec::new()) - } - }) -} - async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { let target_dir = match target_file_path.parent() { Some(parent_dir) => parent_dir, @@ -923,13 +930,18 @@ mod fs_tests { // No delimiter: should recursively list everything let (storage, cancel) = create_storage()?; let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; + let child_sibling = + upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?; let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; let listing = storage .list(None, ListingMode::NoDelimiter, None, &cancel) .await?; assert!(listing.prefixes.is_empty()); - assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); + assert_eq!( + listing.keys.into_iter().collect::>(), + HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) + ); // Delimiter: should only go one deep let listing = storage @@ -942,7 +954,25 @@ mod fs_tests { ); assert!(listing.keys.is_empty()); - // Delimiter & prefix + // Delimiter & prefix with a trailing slash + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!( + listing.keys, + [RemotePath::from_string("uncle").unwrap()].to_vec() + ); + assert_eq!( + listing.prefixes, + 
[RemotePath::from_string("parent").unwrap()].to_vec() + ); + + // Delimiter and prefix without a trailing slash let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), @@ -951,12 +981,66 @@ mod fs_tests { &cancel, ) .await?; + assert_eq!(listing.keys, [].to_vec()); assert_eq!( listing.prefixes, - [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()] - .to_vec() + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + Ok(()) + } + + #[tokio::test] + async fn list_part_component() -> anyhow::Result<()> { + // No delimiter: should recursively list everything + let (storage, cancel) = create_storage()?; + + // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing + // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as + // a freeform prefix. + let _child_a = + upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?; + let _child_b = + upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?; + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some( + &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(), + ), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + + let mut found_prefixes = listing.prefixes.clone(); + found_prefixes.sort(); + assert_eq!( + found_prefixes, + [ + RemotePath::from_string("tenant").unwrap(), + RemotePath::from_string("tenant-01").unwrap(), + ] + .to_vec() ); - assert_eq!(listing.keys, [uncle.clone()].to_vec()); Ok(()) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 1cb85cfb1b..8091681221 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -178,10 +178,7 @@ impl S3Bucket { pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), @@ -471,16 +468,11 @@ impl RemoteStorage for S3Bucket { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) - .or_else(|| self.prefix_in_bucket.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + .or_else(|| { + self.prefix_in_bucket.clone().map(|mut s| { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + s + }) }); let _permit = self.permit(kind, cancel).await?; @@ -549,11 +541,15 @@ impl RemoteStorage for S3Bucket { } } - result.prefixes.extend( - 
prefixes - .iter() - .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), - ); + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); continuation_token = match response.next_continuation_token { Some(new_token) => Some(new_token), @@ -1050,22 +1046,22 @@ mod tests { Some("/test/prefix/"), ]; let expected_outputs = [ - vec!["", "some/path", "some/path"], - vec!["/", "/some/path", "/some/path"], + vec!["", "some/path", "some/path/"], + vec!["/", "/some/path", "/some/path/"], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], ]; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index f5344d3ae2..c467a2d196 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -107,27 +107,6 @@ impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_prefixes(prefix, cancel).await - } - - async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_files(folder, max_keys, cancel).await - } - async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 72f6f956e0..673151c8ef 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; use std::{collections::HashSet, num::NonZeroU32}; @@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) .context("common_prefix construction")?; let root_remote_prefixes = test_client - .list_prefixes(None, &cancel) - .await - .context("client list root prefixes failure")? + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? + .prefixes .into_iter() .collect::>(); assert_eq!( @@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a ); let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix), &cancel) - .await - .context("client list nested prefixes failure")? + .list( + Some(&base_prefix.add_trailing_slash()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? 
+ .prefixes .into_iter() .collect::>(); let remote_only_prefixes = nested_remote_prefixes @@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a /// /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: -/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` -/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` +/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] #[tokio::test] -async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { +async fn list_no_delimiter_works( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { let ctx = match ctx { MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), @@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None, None, &cancel) + .list(None, ListingMode::NoDelimiter, None, &cancel) .await .context("client list root files failure")? + .keys .into_iter() .collect::>(); assert_eq!( root_files, ctx.remote_blobs.clone(), - "remote storage list_files on root mismatches with the uploads." + "remote storage list on root mismatches with the uploads." ); // Test that max_keys limit works. In total there are about 21 files (see // upload_simple_remote_data call in test_real_s3.rs). let limited_root_files = test_client - .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel) + .list( + None, + ListingMode::NoDelimiter, + Some(NonZeroU32::new(2).unwrap()), + &cancel, + ) .await .context("client list root files failure")?; - assert_eq!(limited_root_files.len(), 2); + assert_eq!(limited_root_files.keys.len(), 2); let nested_remote_files = test_client - .list_files(Some(&base_prefix), None, &cancel) + .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel) .await .context("client list nested files failure")? + .keys .into_iter() .collect::>(); let trim_remote_blobs: HashSet<_> = ctx @@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a .collect(); assert_eq!( nested_remote_files, trim_remote_blobs, - "remote storage list_files on subdirrectory mismatches with the uploads." + "remote storage list on subdirrectory mismatches with the uploads." ); Ok(()) } @@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( ctx.client.delete_objects(&[path1, path2], &cancel).await?; - let prefixes = ctx.client.list_prefixes(None, &cancel).await?; + let prefixes = ctx + .client + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? 
+ .prefixes; assert_eq!(prefixes.len(), 1); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6aa02868e6..cd0b2be4b5 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. -// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(AzureWithSimpleTestBlobs), Disabled, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index c5d5216f00..01f6a532d6 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -12,8 +12,8 @@ use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; use remote_storage::{ - DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, - S3Config, + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, + RemoteStorageKind, S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; @@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: client: &Arc, cancel: &CancellationToken, ) -> anyhow::Result> { - Ok(retry(|| client.list_files(None, None, cancel)) - .await - .context("list root files failure")? - .into_iter() - .collect::>()) + Ok( + retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) + .await + .context("list root files failure")? + .keys + .into_iter() + .collect::>(), + ) } let cancel = CancellationToken::new(); @@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(S3WithSimpleTestBlobs), Disabled, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1fa3badefb..d02f00adad 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -202,7 +202,9 @@ use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{ + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, +}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; @@ -1145,7 +1147,7 @@ impl RemoteTimelineClient { // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); - // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // Execute all pending deletions, so that when we proceed to do a listing below, we aren't // taking the burden of listing all the layers that we already know we should delete. self.flush_deletion_queue().await?; @@ -1154,14 +1156,20 @@ impl RemoteTimelineClient { let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path), None, &cancel) + .list( + Some(&timeline_storage_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await }, "list remaining files", &cancel, ) .await - .context("list files remaining files")?; + .context("list files remaining files")? + .keys; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6ee8ad7155..84692aa577 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -258,7 +258,7 @@ pub async fn list_remote_timelines( tenant_shard_id: TenantShardId, cancel: CancellationToken, ) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id); + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); fail::fail_point!("storage-sync-list-remote-timelines", |_| { anyhow::bail!("storage-sync-list-remote-timelines"); @@ -417,11 +417,16 @@ pub(super) async fn download_index_part( let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = download_retry( - || async { storage.list_files(Some(&index_prefix), None, cancel).await }, + || async { + storage + .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .await + }, "list index_part files", cancel, ) - .await?; + .await? + .keys; // General case logic for which index to use: the latest index whose generation // is <= our own. 
See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index e3f6a606a0..e496f07114 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -18,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata}; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -601,12 +601,18 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { backoff::retry( || async { // Do list-delete in batch_size batches to make progress even if there a lot of files. - // Alternatively we could make list_files return iterator, but it is more complicated and + // Alternatively we could make remote storage list return iterator, but it is more complicated and // I'm not sure deleting while iterating is expected in s3. loop { let files = storage - .list_files(Some(&remote_path), Some(batch_size), &cancel) - .await?; + .list( + Some(&remote_path), + ListingMode::NoDelimiter, + Some(batch_size), + &cancel, + ) + .await? + .keys; if files.is_empty() { return Ok(()); // done } @@ -666,8 +672,9 @@ pub async fn copy_s3_segments( let cancel = CancellationToken::new(); let files = storage - .list_files(Some(&remote_path), None, &cancel) - .await?; + .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel) + .await? + .keys; let uploaded_segments = &files .iter() From ee9ec26808d71e441b7d0c96bf9a046ced831f88 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Apr 2024 17:16:17 +0100 Subject: [PATCH 065/157] pageserver: change pitr_interval=0 behavior (#7423) ## Problem We already made a change in #6407 to make pitr_interval authoritative for synthetic size calculations (do not charge users for data retained due to gc_horizon), but that change didn't cover the case where someone entirely disables time-based retention by setting pitr_interval=0 Relates to: https://github.com/neondatabase/neon/issues/6374 ## Summary of changes When pitr_interval is zero, do not set `pitr_cutoff` based on gc_horizon. gc_horizon is still enforced, but separately (its value is passed separately, there was never a need to claim pitr_cutoff to gc_horizon) ## More detail ### Issue 1 Before this PR, we would skip the update_gc_info for timelines with last_record_lsn() < gc_horizon. Let's call such timelines "tiny". The rationale for that presumably was that we can't GC anything in the tiny timelines, why bother to call update_gc_info(). However, synthetic size calculation relies on up-to-date update_gc_info() data. Before this PR, tiny timelines would never get an updated GcInfo::pitr_horizon (it remained Lsn(0)). Even on projects with pitr_interval=0d. With this PR, update_gc_info is always called, hence GcInfo::pitr_horizon is always updated, thereby providing synthetic size calculation with up-to-data data. ### Issue 2 Before this PR, regardless of whether the timeline is "tiny" or not, GcInfo::pitr_horizon was clamped to at least last_record_lsn - gc_horizon, even if the pitr window in terms of LSN range was shorter (=less than) the gc_horizon. With this PR, that clamping is removed, so, for pitr_interval=0, the pitr_horizon = last_record_lsn. 
--- pageserver/src/tenant.rs | 29 +++++----- pageserver/src/tenant/timeline.rs | 5 +- test_runner/regress/test_tenant_size.py | 71 +++++-------------------- 3 files changed, 30 insertions(+), 75 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 098bad71fb..15350e93e9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2870,20 +2870,23 @@ impl Tenant { } } - if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); - gc_timelines.push(timeline); - } + let branchpoints: Vec = all_branchpoints + .range(( + Included((timeline_id, Lsn(0))), + Included((timeline_id, Lsn(u64::MAX))), + )) + .map(|&x| x.1) + .collect(); + timeline + .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) + .await?; + + gc_timelines.push(timeline); } drop(gc_cs); Ok(gc_timelines) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2fbe3c63a2..22b8a17874 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4244,9 +4244,8 @@ impl Timeline { *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention was configured. Set time-based cutoff to - // same as LSN based. - cutoff_horizon + // No time-based retention was configured. Interpret this as "keep no history". + self.get_last_record_lsn() }; // Grab the lock and update the values diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 4c8fd4b0e5..a588f6ab53 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -292,33 +292,12 @@ def test_single_branch_get_tenant_size_grows( Operate on single branch reading the tenants size after each transaction. """ - # Disable automatic gc and compaction. - # The pitr_interval here is quite problematic, so we cannot really use it. - # it'd have to be calibrated per test executing env. - - # there was a bug which was hidden if the create table and first batch of - # inserts is larger than gc_horizon. for example 0x20000 here hid the fact - # that there next_gc_cutoff could be smaller than initdb_lsn, which will - # obviously lead to issues when calculating the size. - gc_horizon = 0x3BA00 - - # it's a bit of a hack, but different versions of postgres have different - # amount of WAL generated for the same amount of data. so we need to - # adjust the gc_horizon accordingly. 
- if pg_version == PgVersion.V14: - gc_horizon = 0x4A000 - elif pg_version == PgVersion.V15: - gc_horizon = 0x3BA00 - elif pg_version == PgVersion.V16: - gc_horizon = 210000 - else: - raise NotImplementedError(pg_version) - + # Disable automatic compaction and GC, and set a long PITR interval: we will expect + # size to always increase with writes as all writes remain within the PITR tenant_config = { "compaction_period": "0s", "gc_period": "0s", - "pitr_interval": "0s", - "gc_horizon": gc_horizon, + "pitr_interval": "3600s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) @@ -332,18 +311,6 @@ def test_single_branch_get_tenant_size_grows( size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change( - current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int - ): - if current_lsn - initdb_lsn >= gc_horizon: - assert ( - size >= prev_size - ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - else: - assert ( - size > prev_size - ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - def get_current_consistent_size( env: NeonEnv, endpoint: Endpoint, @@ -412,14 +379,6 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - # "gc_horizon" is tuned so that it kicks in _after_ the - # insert phase, but before the update phase ends. - assert ( - current_lsn - initdb_lsn <= gc_horizon - ), "Tuning of GC window is likely out-of-date" assert size > prev_size collected_responses.append(("INSERT", current_lsn, size)) @@ -439,8 +398,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("UPDATE", current_lsn, size)) @@ -457,8 +415,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("DELETE", current_lsn, size)) @@ -469,20 +426,20 @@ def test_single_branch_get_tenant_size_grows( with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") - # Without setting a PITR interval, dropping the table doesn't reclaim any space - # from the user's point of view, because the DROP transaction is too small - # to fall out of gc_horizon. + # Dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is still + # within pitr_interval. 
(current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size >= prev_size + prev_size = size - # Set a tiny PITR interval to allow the DROP to impact the synthetic size + # Set a zero PITR interval to allow the DROP to impact the synthetic size # Because synthetic size calculation uses pitr interval when available, # when our tenant is configured with a tiny pitr interval, dropping a table should # cause synthetic size to go down immediately - tenant_config["pitr_interval"] = "1ms" + tenant_config["pitr_interval"] = "0s" env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) (current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) @@ -494,10 +451,6 @@ def test_single_branch_get_tenant_size_grows( # defined by gc_horizon. collected_responses.append(("DROP", current_lsn, size)) - # Should have gone past gc_horizon, otherwise gc_horizon is too large - bytes_written = current_lsn - initdb_lsn - assert bytes_written > gc_horizon - # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. From 18fd73d84afd1086414ba9fae1d08c16660809ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 24 Apr 2024 00:46:48 +0200 Subject: [PATCH 066/157] get_lsn_by_timestamp: clamp commit_lsn to be >= min_lsn (#7488) There was an edge case where `get_lsn_by_timestamp`/`find_lsn_for_timestamp` could have returned an lsn that is before the limits we enforce: when we did find SLRU entries with timestamps before the one we search for. The API contract of `get_lsn_by_timestamp` is to not return something before the ancestor lsn. cc https://neondb.slack.com/archives/C03F5SM1N02/p1713871064147029 --- pageserver/src/pgdatadir_mapping.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c733b38acb..2c98c0b6c8 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -448,6 +448,11 @@ impl Timeline { // include physical changes from later commits that will be marked // as aborted, and will need to be vacuumed away. let commit_lsn = Lsn((low - 1) * 8); + // This maxing operation is for the edge case that the search above did + // set found_smaller to true but it never increased the lsn. Then, low + // is still the old min_lsn, and the subtraction above could possibly give a value + // below the ancestor_lsn. + let commit_lsn = commit_lsn.max(min_lsn); match (found_smaller, found_larger) { (false, false) => { // This can happen if no commit records have been processed yet, e.g. From a60035b23a2f05e512036131f5aef506e583c213 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 24 Apr 2024 11:38:59 +0300 Subject: [PATCH 067/157] fix: avoid starving background task permits in eviction task (#7471) As seen with a recent incident, eviction tasks can cause pageserver-wide permit starvation on the background task semaphore when synthetic size calculation takes a long time for a tenant that has more timelines than we have permits, or when multiple tenants have slow synthetic size calculations and their total number of timelines exceeds the permits. Metric links can be found in the internal [slack thread].
As a solution, release the permit while waiting for the state guarding the synthetic size calculation. This will most likely hurt the eviction task eviction performance, but that does not matter because we are hoping to get away from it using OnlyImitiate policy anyway and rely solely on disk usage-based eviction. [slack thread]: https://neondb.slack.com/archives/C06UEMLK7FE/p1713810505587809?thread_ts=1713468604.508969&cid=C06UEMLK7FE --- pageserver/src/tenant/tasks.rs | 2 +- .../src/tenant/timeline/eviction_task.rs | 68 ++++++++++++------- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 74ed677ffe..41b77c1f4a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -62,7 +62,7 @@ impl BackgroundLoopKind { pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> impl Drop { +) -> tokio::sync::SemaphorePermit<'static> { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE .with_label_values(&[loop_kind.as_static_str()]) .guard(); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 304d0d60ee..3567761b9a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -188,24 +188,10 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, - ctx, - ); + let permit = self.acquire_imitation_permit(cancel, ctx).await?; - let _permit = tokio::select! { - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - match self - .imitate_layer_accesses(tenant, p, cancel, gate, ctx) - .await - { - ControlFlow::Break(()) => return ControlFlow::Break(()), - ControlFlow::Continue(()) => (), - } + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await?; #[derive(Debug, Default)] struct EvictionStats { @@ -330,19 +316,27 @@ impl Timeline { gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await + } + + async fn acquire_imitation_permit( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, ); - let _permit = tokio::select! { - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - self.imitate_layer_accesses(tenant, p, cancel, gate, ctx) - .await + tokio::select! 
{ + permit = acquire_permit => ControlFlow::Continue(permit), + _ = cancel.cancelled() => ControlFlow::Break(()), + _ = self.cancel.cancelled() => ControlFlow::Break(()), + } } /// If we evict layers but keep cached values derived from those layers, then @@ -376,6 +370,7 @@ impl Timeline { p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, + permit: tokio::sync::SemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { if !self.tenant_shard_id.is_shard_zero() { @@ -408,7 +403,28 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let mut state = tenant.eviction_task_tenant_state.lock().await; + let (mut state, _permit) = { + if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() { + (locked, permit) + } else { + // we might need to wait for a long time here in case of pathological synthetic + // size calculation performance + drop(permit); + let locked = tokio::select! { + locked = tenant.eviction_task_tenant_state.lock() => locked, + _ = self.cancel.cancelled() => { + return ControlFlow::Break(()) + }, + _ = cancel.cancelled() => { + return ControlFlow::Break(()) + } + }; + // then reacquire -- this will be bad if there is a lot of traffic, but because we + // released the permit, the overall latency will be much better. + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + (locked, permit) + } + }; match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { From 5dda371c2b75213bb3fa286cc7ba612980379613 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 24 Apr 2024 15:13:18 +0200 Subject: [PATCH 068/157] Fix a bug with retries (#7494) ## Problem ## Summary of changes By default, it's 5s retry. 
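For context, the read loop is expected to end up shaped roughly like this (a hedged sketch with stand-in types, not the actual `EndpointsCache` code): every iteration falls through to the `retry_interval` sleep, which defaults to 5 seconds, the ready flag is only cleared when something actually fails, and cancellation is observed instead of retrying forever.

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use tokio_util::sync::CancellationToken;

/// Hedged sketch of the paced retry loop (simplified; the real code also
/// reconnects to Redis and parks the task on cancellation instead of returning).
async fn read_loop(
    mut try_read: impl FnMut() -> anyhow::Result<()>,
    ready: &AtomicBool,
    cancel: CancellationToken,
    retry_interval: Duration, // assumed to default to 5s in the proxy config
) {
    loop {
        if let Err(e) = try_read() {
            tracing::error!("error reading from redis: {e:?}");
            // Only mark the cache not-ready when a connect/read actually failed.
            ready.store(false, Ordering::Release);
        }
        if cancel.is_cancelled() {
            tracing::info!("cancellation requested, stopping the read loop");
            break;
        }
        // Pace retries instead of spinning on errors.
        tokio::time::sleep(retry_interval).await;
    }
}
```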
--- proxy/src/bin/proxy.rs | 7 +++++-- proxy/src/cache/endpoints.rs | 12 ++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7df320fd42..760ccf40d4 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -403,7 +403,7 @@ async fn main() -> anyhow::Result<()> { maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); client_tasks.spawn(usage_metrics::task_backup( &metrics_config.backup_metric_collection_config, - cancellation_token, + cancellation_token.clone(), )); } @@ -423,7 +423,10 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.endpoints_cache.clone(); let con = regional_redis_client; let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span)); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); } } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 2aa1986d5e..02511e6ff7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -4,6 +4,7 @@ use std::{ atomic::{AtomicBool, Ordering}, Arc, }, + time::Duration, }; use dashmap::DashSet; @@ -13,6 +14,7 @@ use redis::{ }; use serde::Deserialize; use tokio::sync::Mutex; +use tokio_util::sync::CancellationToken; use tracing::info; use crate::{ @@ -111,16 +113,22 @@ impl EndpointsCache { pub async fn do_read( &self, mut con: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, ) -> anyhow::Result { let mut last_id = "0-0".to_string(); loop { - self.ready.store(false, Ordering::Release); if let Err(e) = con.connect().await { tracing::error!("error connecting to redis: {:?}", e); - continue; + self.ready.store(false, Ordering::Release); } if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { tracing::error!("error reading from redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if cancellation_token.is_cancelled() { + info!("cancellation token is cancelled, exiting"); + tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await; + // 1 week. } tokio::time::sleep(self.config.retry_interval).await; } From 2a3a8ee31d5ddf98a8b1e335034ddbdd2818dc12 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 24 Apr 2024 14:52:46 +0100 Subject: [PATCH 069/157] pageserver: publish the same metrics from both read paths (#7486) ## Problem Vectored and non-vectored read paths don't publish the same set of metrics. Metrics parity is needed for coalescing the read paths. ## Summary of changes * Publish reconstruct time and fetching data for reconstruct time from the vectored read path * Remove pageserver_getpage_reconstruct_seconds{res="err"} - wasn't used anyway --- pageserver/src/metrics.rs | 52 ++++++++++++++++++++++++------- pageserver/src/tenant/timeline.rs | 22 +++++++++++-- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 66bf21ddec..6ce7f286b3 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -105,31 +105,39 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { }); // Metrics collected on operations on the storage repository. 
+#[derive( + Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, +)] +pub(crate) enum GetKind { + Singular, + Vectored, +} pub(crate) struct ReconstructTimeMetrics { - ok: Histogram, - err: Histogram, + singular: Histogram, + vectored: Histogram, } pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["result"], + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); + ReconstructTimeMetrics { - ok: inner.get_metric_with_label_values(&["ok"]).unwrap(), - err: inner.get_metric_with_label_values(&["err"]).unwrap(), + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), } }); impl ReconstructTimeMetrics { - pub(crate) fn for_result(&self, result: &Result) -> &Histogram { - match result { - Ok(_) => &self.ok, - Err(_) => &self.err, + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, } } } @@ -142,13 +150,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram!( +pub(crate) struct ReconstructDataTimeMetrics { + singular: Histogram, + vectored: Histogram, +} + +impl ReconstructDataTimeMetrics { + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, + } + } +} + +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + ReconstructDataTimeMetrics { + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), + } }); pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 22b8a17874..11d96bf1a6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -86,7 +86,7 @@ use crate::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{ - TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, + GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; @@ -797,7 +797,9 @@ impl Timeline { img: cached_page_img, }; - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(GetKind::Singular) + .start_timer(); let path = self .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; @@ -807,7 +809,7 @@ impl Timeline { let res = self.reconstruct_value(key, lsn, reconstruct_state).await; let elapsed = start.elapsed(); crate::metrics::RECONSTRUCT_TIME - .for_result(&res) + .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); if cfg!(feature = "testing") && 
res.is_err() { @@ -969,9 +971,22 @@ impl Timeline { ) -> Result>, GetVectoredError> { let mut reconstruct_state = ValuesReconstructState::new(); + let get_kind = if keyspace.total_size() == 1 { + GetKind::Singular + } else { + GetKind::Vectored + }; + + let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(get_kind) + .start_timer(); self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) .await?; + get_data_timer.stop_and_record(); + let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME + .for_get_kind(get_kind) + .start_timer(); let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { @@ -987,6 +1002,7 @@ impl Timeline { } } } + reconstruct_timer.stop_and_record(); // Note that this is an approximation. Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient From c12861cccda7c8bc7b57260843102c09be58f733 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 24 Apr 2024 15:36:23 +0100 Subject: [PATCH 070/157] pageserver: finish vectored get early (#7490) ## Problem If the previous step of the vectored left no further keyspace to investigate (i.e. keyspace remains empty after removing keys completed in the previous step), then we'd still grab the layers lock, potentially add an in-mem layer to the fringe and at some further point read its index without reading any values from it. ## Summary of changes If there's nothing left in the current keyspace, then skip the search and just select the next item from the fringe as usual. When running `test_pg_regress[release-pg16]` with the vectored read path for singular gets this improved perf drastically (see PR cover letter). ## Correctness Since no keys remained from the previous range (i.e. we are on a leaf node) there's nothing that search can find in deeper nodes. --- libs/pageserver_api/src/keyspace.rs | 4 ++ pageserver/src/tenant/timeline.rs | 92 +++++++++++++++-------------- 2 files changed, 53 insertions(+), 43 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 78e4a3d735..a9e19e8cc7 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -162,6 +162,10 @@ impl KeySpace { .sum() } + pub fn is_empty(&self) -> bool { + self.total_size() == 0 + } + fn overlaps_at(&self, range: &Range) -> Option { match self.ranges.binary_search_by_key(&range.end, |r| r.start) { Ok(0) => None, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 11d96bf1a6..703654a37c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3143,55 +3143,61 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + // Do not descent any further if the last layer we visited + // completed all keys in the keyspace it inspected. This is not + // required for correctness, but avoids visiting extra layers + // which turns out to be a perf bottleneck in some cases. 
+ if !unmapped_keyspace.is_empty() { + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + let in_memory_layer = layers.find_in_memory_layer(|l| { + let start_lsn = l.get_lsn_range().start; + cont_lsn > start_lsn + }); - match in_memory_layer { - Some(l) => { - let lsn_range = l.get_lsn_range().start..cont_lsn; - fringe.update( - ReadableLayer::InMemoryLayer(l), - unmapped_keyspace.clone(), - lsn_range, - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); + match in_memory_layer { + Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; + fringe.update( + ReadableLayer::InMemoryLayer(l), + unmapped_keyspace.clone(), + lsn_range, + ); + } + None => { + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); + } } } - } - // It's safe to drop the layer map lock after planning the next round of reads. - // The fringe keeps readable handles for the layers which are safe to read even - // if layers were compacted or flushed. - // - // The more interesting consideration is: "Why is the read algorithm still correct - // if the layer map changes while it is operating?". Doing a vectored read on a - // timeline boils down to pushing an imaginary lsn boundary downwards for each range - // covered by the read. The layer map tells us how to move the lsn downwards for a - // range at *a particular point in time*. It is fine for the answer to be different - // at two different time points. - drop(guard); + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); + } if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { let next_cont_lsn = lsn_range.start; From 447a063f3c6583ed8e1946900493c1343b1daaef Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 24 Apr 2024 11:09:23 -0400 Subject: [PATCH 071/157] fix(metrics): correct maxrss metrics on macos (#7487) macOS max_rss is in bytes, while Linux is in kilobytes. 
https://stackoverflow.com/a/59915669 --------- Signed-off-by: Alex Chi Z --- libs/metrics/src/lib.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 2cf3cdeaa7..8e0dbe6ce4 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -256,7 +256,16 @@ fn update_rusage_metrics() { DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); - MAXRSS_KB.set(rusage_stats.ru_maxrss); + + // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669 + #[cfg(target_os = "macos")] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024); + } + #[cfg(not(target_os = "macos"))] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss); + } } fn get_rusage_stats() -> libc::rusage { From c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 24 Apr 2024 18:48:25 +0200 Subject: [PATCH 072/157] Ability to specify the upload_storage_class in S3 bucket configuration (#7461) Currently we move data to the intended storage class via lifecycle rules, but those are a daily batch job so data first spends up to a day in standard storage. Therefore, make it possible to specify the storage class used for uploads to S3 so that the data doesn't have to be migrated automatically. The advantage of this is that it gives cleaner billing reports. Part of https://github.com/neondatabase/cloud/issues/11348 --- libs/remote_storage/src/lib.rs | 15 +++++++++++++++ libs/remote_storage/src/s3_bucket.rs | 8 +++++++- libs/remote_storage/tests/test_real_s3.rs | 1 + pageserver/src/config.rs | 1 + proxy/src/context/parquet.rs | 1 + 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 14c391ca53..32bc71c513 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -21,11 +21,13 @@ use std::{ fmt::Debug, num::{NonZeroU32, NonZeroUsize}, pin::Pin, + str::FromStr, sync::Arc, time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; +use aws_sdk_s3::types::StorageClass; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; @@ -563,6 +565,7 @@ pub struct S3Config { /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, pub max_keys_per_list_response: Option, + pub upload_storage_class: Option, } impl Debug for S3Config { @@ -691,6 +694,18 @@ impl RemoteStorageConfig { endpoint, concurrency_limit, max_keys_per_list_response, + upload_storage_class: toml + .get("upload_storage_class") + .map(|prefix_in_bucket| -> anyhow::Result<_> { + let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + bail!("Specified storage class unknown to SDK: '{s}'. 
Allowed values: {:?}", StorageClass::values()); + } + Ok(storage_class) + }) + .transpose()?, }) } (_, _, _, Some(_), None) => { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 8091681221..c0b89cee2a 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -30,7 +30,7 @@ use aws_sdk_s3::{ config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; @@ -62,6 +62,7 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. pub timeout: Duration, @@ -154,6 +155,7 @@ impl S3Bucket { max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + upload_storage_class: aws_config.upload_storage_class.clone(), timeout, }) } @@ -582,6 +584,7 @@ impl RemoteStorage for S3Bucket { .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) + .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) .send(); @@ -633,6 +636,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) .send(); @@ -890,6 +894,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(key) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(&source_id) .send(); @@ -1073,6 +1078,7 @@ mod tests { endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), + upload_storage_class: None, }; let storage = S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 01f6a532d6..a273abe867 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -380,6 +380,7 @@ fn create_s3_client( endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index e10db2b853..10d5a22797 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1557,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}' endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e061216d15..9600321937 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -413,6 +413,7 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) From e8814b6f81388d389f629a80f3620de99283a79c 
Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 25 Apr 2024 10:46:07 +0100 Subject: [PATCH 073/157] controller: limit Reconciler concurrency (#7493) ## Problem Storage controller memory can spike very high if we have many tenants and they all try to reconcile at the same time. Related: - https://github.com/neondatabase/neon/issues/7463 - https://github.com/neondatabase/neon/issues/7460 Not closing those issues in this PR, because the test coverage for them will be in https://github.com/neondatabase/neon/pull/7475 ## Summary of changes - Add a CLI arg `--reconciler-concurrency`, defaulted to 128 - Add a semaphore to Service with this many units - In `maybe_reconcile_shard`, try to acquire semaphore unit. If we can't get one, return a ReconcileWaiter for a future sequence number, and push the TenantShardId onto a channel of delayed IDs. - In `process_result`, consume from the channel of delayed IDs if there are semaphore units available and call maybe_reconcile_shard again for these delayed shards. This has been tested in https://github.com/neondatabase/neon/pull/7475, but will land that PR separately because it contains other changes & needs the test stabilizing. This change is worth merging sooner, because it fixes a practical issue with larger shard counts. --- storage_controller/src/main.rs | 11 +- storage_controller/src/reconciler.rs | 17 ++++ storage_controller/src/service.rs | 124 +++++++++++++++++++++-- storage_controller/src/tenant_shard.rs | 133 ++++++++++++++++++------- 4 files changed, 238 insertions(+), 47 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ca55d6c593..d84803733a 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -9,7 +9,9 @@ use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; -use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; +use storage_controller::service::{ + Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, +}; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; @@ -64,6 +66,10 @@ struct Cli { /// Grace period before marking unresponsive pageserver offline #[arg(long)] max_unavailable_interval: Option, + + /// Maximum number of reconcilers that may run in parallel + #[arg(long)] + reconciler_concurrency: Option, } enum StrictMode { @@ -243,6 +249,9 @@ async fn async_main() -> anyhow::Result<()> { .max_unavailable_interval .map(humantime::Duration::into) .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + reconciler_concurrency: args + .reconciler_concurrency + .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 49cfaad569..28801ede6e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -51,6 +51,10 @@ pub(super) struct Reconciler { /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, + /// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many + /// we will spawn. 
+ pub(crate) _resource_units: ReconcileUnits, + /// A means to abort background reconciliation: it is essential to /// call this when something changes in the original TenantShard that /// will make this reconciliation impossible or unnecessary, for @@ -66,6 +70,19 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O +pub(crate) struct ReconcileUnits { + _sem_units: tokio::sync::OwnedSemaphorePermit, +} + +impl ReconcileUnits { + pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self { + Self { + _sem_units: sem_units, + } + } +} + /// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// reference counting for Scheduler. The IntentState is what the scheduler works with, /// and the TargetState is just the instruction for a particular Reconciler run. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 0565f8e7b4..2e6f3750e7 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -10,8 +10,9 @@ use std::{ use crate::{ id_lock_map::IdLockMap, persistence::{AbortShardSplitStatus, TenantFilter}, - reconciler::ReconcileError, + reconciler::{ReconcileError, ReconcileUnits}, scheduler::{ScheduleContext, ScheduleMode}, + tenant_shard::ReconcileNeeded, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -48,7 +49,7 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; -use tokio::sync::OwnedRwLockWriteGuard; +use tokio::sync::{mpsc::error::TrySendError, OwnedRwLockWriteGuard}; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{ @@ -90,6 +91,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); +pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; + +// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. +// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly +// than they're being pushed onto the queue. +const MAX_DELAYED_RECONCILES: usize = 10000; + // Top level state available to all HTTP handlers struct ServiceState { tenants: BTreeMap, @@ -97,6 +105,9 @@ struct ServiceState { nodes: Arc>, scheduler: Scheduler, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, } impl ServiceState { @@ -104,11 +115,13 @@ impl ServiceState { nodes: HashMap, tenants: BTreeMap, scheduler: Scheduler, + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, ) -> Self { Self { tenants, nodes: Arc::new(nodes), scheduler, + delayed_reconcile_rx, } } @@ -142,6 +155,9 @@ pub struct Config { /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. pub max_unavailable_interval: Duration, + + /// How many Reconcilers may be spawned concurrently + pub reconciler_concurrency: usize, } impl From for ApiError { @@ -180,6 +196,17 @@ pub struct Service { // that transition it to/from Active. 
node_op_locks: IdLockMap, + // Limit how many Reconcilers we will spawn concurrently + reconciler_concurrency: Arc, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + /// Send into this queue to promptly attempt to reconcile this shard next time units are available. + /// + /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceInner. This could be optimized to + /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. + delayed_reconcile_tx: tokio::sync::mpsc::Sender, + // Process shutdown will fire this token cancel: CancellationToken, @@ -742,8 +769,9 @@ impl Service { } /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation - /// was successful, this will update the observed state of the tenant such that subsequent - /// calls to [`TenantShard::maybe_reconcile`] will do nothing. + /// was successful and intent hasn't changed since the Reconciler was spawned, this will update + /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] + /// will indicate that reconciliation is not needed. #[instrument(skip_all, fields( tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence @@ -804,6 +832,21 @@ impl Service { } } } + + // Maybe some other work can proceed now that this job finished. + if self.reconciler_concurrency.available_permits() > 0 { + while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + if let Some(shard) = tenants.get_mut(&tenant_shard_id) { + shard.delayed_reconcile = false; + self.maybe_reconcile_shard(shard, nodes); + } + + if self.reconciler_concurrency.available_permits() == 0 { + break; + } + } + } } async fn process_results( @@ -986,6 +1029,9 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + let (delayed_reconcile_tx, delayed_reconcile_rx) = + tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); + let cancel = CancellationToken::new(); let heartbeater = Heartbeater::new( config.jwt_token.clone(), @@ -994,13 +1040,20 @@ impl Service { ); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( - nodes, tenants, scheduler, + nodes, + tenants, + scheduler, + delayed_reconcile_rx, ))), config: config.clone(), persistence, - compute_hook: Arc::new(ComputeHook::new(config)), + compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, heartbeater, + reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.reconciler_concurrency, + )), + delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), cancel, @@ -1535,7 +1588,7 @@ impl Service { let (response, waiters) = self.do_tenant_create(create_req).await?; - if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to // accept compute notifications while it is in the process of creating. Reconciliation will // be retried in the background. 
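The acquire-or-enqueue pattern that the next hunk implements in `maybe_reconcile_shard` can be sketched in isolation roughly as follows. This is a hedged illustration only: `ShardId` and `Limiter` are stand-ins invented for the sketch, not the controller's real `TenantShardId`/`Service` types.

```rust
// Rough, self-contained model of "acquire a concurrency unit now, or park the
// shard id on a bounded queue and let a later completion drain it".
use std::sync::Arc;
use tokio::sync::{mpsc, OwnedSemaphorePermit, Semaphore};

type ShardId = u64; // hypothetical stand-in for TenantShardId

struct Limiter {
    permits: Arc<Semaphore>,
    delayed_tx: mpsc::Sender<ShardId>,
    delayed_rx: mpsc::Receiver<ShardId>,
}

impl Limiter {
    fn new(concurrency: usize, queue_depth: usize) -> Self {
        let (delayed_tx, delayed_rx) = mpsc::channel(queue_depth);
        Self {
            permits: Arc::new(Semaphore::new(concurrency)),
            delayed_tx,
            delayed_rx,
        }
    }

    /// Either we get a permit immediately, or the shard is parked on the bounded
    /// queue. Dropping the id when the queue is full is acceptable because a
    /// periodic background pass re-checks dirty shards anyway.
    fn start_or_enqueue(&self, shard: ShardId) -> Option<OwnedSemaphorePermit> {
        match self.permits.clone().try_acquire_owned() {
            Ok(permit) => Some(permit),
            Err(_) => {
                let _ = self.delayed_tx.try_send(shard);
                None
            }
        }
    }

    /// Called after a unit of work finishes (its permit has been dropped):
    /// hand out (shard, permit) pairs while both spare permits and delayed
    /// shards are available.
    fn on_work_finished(&mut self) -> Vec<(ShardId, OwnedSemaphorePermit)> {
        let mut runnable = Vec::new();
        while let Ok(permit) = self.permits.clone().try_acquire_owned() {
            match self.delayed_rx.try_recv() {
                Ok(shard) => runnable.push((shard, permit)),
                Err(_) => break, // queue empty; the spare permit is dropped here
            }
        }
        runnable
    }
}
```

Acquiring the permit before popping from the queue means a permit is never handed out without a waiting shard, and an idle pop simply drops the permit again.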
@@ -4053,20 +4106,64 @@ impl Service { Ok(()) } - /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides - /// all the references to parts of Self that are needed + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, ) -> Option { - shard.maybe_reconcile( + let reconcile_needed = shard.get_reconcile_needed(nodes); + + match reconcile_needed { + ReconcileNeeded::No => return None, + ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), + ReconcileNeeded::Yes => { + // Fall through to try and acquire units for spawning reconciler + } + }; + + let units = match self.reconciler_concurrency.clone().try_acquire_owned() { + Ok(u) => ReconcileUnits::new(u), + Err(_) => { + tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), + "Concurrency limited: enqueued for reconcile later"); + if !shard.delayed_reconcile { + match self.delayed_reconcile_tx.try_send(shard.tenant_shard_id) { + Err(TrySendError::Closed(_)) => { + // Weird mid-shutdown case? + } + Err(TrySendError::Full(_)) => { + // It is safe to skip sending our ID in the channel: we will eventually get retried by the background reconcile task. + tracing::warn!( + "Many shards are waiting to reconcile: delayed_reconcile queue is full" + ); + } + Ok(()) => { + shard.delayed_reconcile = true; + } + } + } + + // We won't spawn a reconciler, but we will construct a waiter that waits for the shard's sequence + // number to advance. When this function is eventually called again and succeeds in getting units, + // it will spawn a reconciler that makes this waiter complete. + return Some(shard.future_reconcile_waiter()); + } + }; + + let Ok(gate_guard) = self.gate.enter() else { + // Gate closed: we're shutting down, drop out. + return None; + }; + + shard.spawn_reconciler( &self.result_tx, nodes, &self.compute_hook, &self.config, &self.persistence, - &self.gate, + units, + gate_guard, &self.cancel, ) } @@ -4088,6 +4185,11 @@ impl Service { schedule_context = ScheduleContext::default(); } + // Skip checking if this shard is already enqueued for reconciliation + if shard.delayed_reconcile { + continue; + } + // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 58b8ef8d5d..d69260b9e7 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,6 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, + reconciler::ReconcileUnits, scheduler::{AffinityScore, MaySchedule, ScheduleContext}, }; use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; @@ -22,7 +23,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, - sync::gate::Gate, + sync::gate::GateGuard, }; use crate::{ @@ -95,6 +96,10 @@ pub(crate) struct TenantShard { /// reconciliation, and timeline creation. pub(crate) splitting: SplitState, + /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag + /// is set. This flag is cleared when the tenant is popped off the delay queue. 
+ pub(crate) delayed_reconcile: bool, + /// Optionally wait for reconciliation to complete up to a particular /// sequence number. #[serde(skip)] @@ -113,8 +118,8 @@ pub(crate) struct TenantShard { pub(crate) last_error: std::sync::Arc>, /// If we have a pending compute notification that for some reason we weren't able to send, - /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry - /// sending it. This is the mechanism by which compute notifications are included in the scope + /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes + /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, @@ -353,6 +358,17 @@ pub(crate) struct ReconcilerHandle { cancel: CancellationToken, } +pub(crate) enum ReconcileNeeded { + /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler + /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it) + No, + /// shard has a reconciler running, and its intent hasn't changed since that one was + /// spawned: wait for the existing reconciler rather than spawning a new one. + WaitExisting(ReconcilerWaiter), + /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] + Yes, +} + /// When a reconcile task completes, it sends this result object /// to be applied to the primary TenantShard. pub(crate) struct ReconcileResult { @@ -396,6 +412,7 @@ impl TenantShard { reconciler: None, splitting: SplitState::Idle, sequence: Sequence(1), + delayed_reconcile: false, waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), @@ -831,16 +848,10 @@ impl TenantShard { #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] - pub(crate) fn maybe_reconcile( + pub(crate) fn get_reconcile_needed( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, - compute_hook: &Arc, - service_config: &service::Config, - persistence: &Arc, - gate: &Gate, - cancel: &CancellationToken, - ) -> Option { + ) -> ReconcileNeeded { // If there are any ambiguous observed states, and the nodes they refer to are available, // we should reconcile to clean them up. let mut dirty_observed = false; @@ -863,7 +874,7 @@ impl TenantShard { if !do_reconcile { tracing::info!("Not dirty, no reconciliation needed."); - return None; + return ReconcileNeeded::No; } // If we are currently splitting, then never start a reconciler task: the splitting logic @@ -871,7 +882,7 @@ impl TenantShard { // up top, so that we only log this message if we would otherwise have done a reconciliation. if !matches!(self.splitting, SplitState::Idle) { tracing::info!("Refusing to reconcile, splitting in progress"); - return None; + return ReconcileNeeded::No; } // Reconcile already in flight for the current sequence? 
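The hunks below add `ensure_sequence_ahead` and `future_reconcile_waiter`; the underlying idea can be modelled as a small standalone sketch. The `tokio::sync::watch`-based `SeqWait` here is a stand-in for the controller's real sequence-wait machinery, and the names are illustrative only.

```rust
// Rough standalone model: a waiter keyed on a sequence number that only a
// *future* reconciler run can satisfy.
use tokio::sync::watch;

/// Minimal stand-in for the controller's SeqWait.
struct SeqWait {
    tx: watch::Sender<u64>,
}

impl SeqWait {
    fn new(start: u64) -> Self {
        Self { tx: watch::channel(start).0 }
    }

    /// Advance the advertised sequence (monotonically).
    fn advance(&self, to: u64) {
        self.tx.send_if_modified(|cur| {
            if to > *cur {
                *cur = to;
                true
            } else {
                false
            }
        });
    }

    /// Wait until the advertised sequence reaches `target`.
    async fn wait_for(&self, target: u64) {
        let mut rx = self.tx.subscribe();
        let _ = rx.wait_for(|cur| *cur >= target).await;
    }
}

/// Bump `sequence` strictly past everything already completed or running, so a
/// waiter created for it can only complete once a later reconciler is spawned
/// and finishes.
fn ensure_sequence_ahead(sequence: &mut u64, running: Option<u64>, completed: u64) {
    let max_seen = running.unwrap_or(0).max(completed);
    if *sequence <= max_seen {
        *sequence = max_seen + 1;
    }
}
```

With this shape, a "future reconcile waiter" amounts to bumping the sequence ahead of everything seen and then waiting on that bumped value.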
@@ -881,7 +892,7 @@ impl TenantShard { "Reconciliation already in progress for sequence {:?}", self.sequence, ); - return Some(ReconcilerWaiter { + return ReconcileNeeded::WaitExisting(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), @@ -900,10 +911,67 @@ impl TenantShard { // We only reach this point if there is work to do and we're going to skip // doing it: warn it obvious why this tenant isn't doing what it ought to. tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); - return None; + return ReconcileNeeded::No; } } + ReconcileNeeded::Yes + } + + /// Ensure the sequence number is set to a value where waiting for this value will make us wait + /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers. + /// + /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property + /// that the waiter will not complete until some future Reconciler is constructed and run. + fn ensure_sequence_ahead(&mut self) { + // Find the highest sequence for which a Reconciler has previously run or is currently + // running + let max_seen = std::cmp::max( + self.reconciler + .as_ref() + .map(|r| r.sequence) + .unwrap_or(Sequence(0)), + std::cmp::max(self.waiter.load(), self.error_waiter.load()), + ); + + if self.sequence <= max_seen { + self.sequence = max_seen.next(); + } + } + + /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. + /// + /// This is appropriate when you can't spawn a recociler (e.g. due to resource limits), but + /// you would like to wait until one gets spawned in the background. + pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { + self.ensure_sequence_ahead(); + + ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + } + } + + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn spawn_reconciler( + &mut self, + result_tx: &tokio::sync::mpsc::UnboundedSender, + pageservers: &Arc>, + compute_hook: &Arc, + service_config: &service::Config, + persistence: &Arc, + units: ReconcileUnits, + gate_guard: GateGuard, + cancel: &CancellationToken, + ) -> Option { + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before + // doing our sequence's work. + let old_handle = self.reconciler.take(); + // Build list of nodes from which the reconciler should detach let mut detach = Vec::new(); for node_id in self.observed.locations.keys() { @@ -919,18 +987,9 @@ impl TenantShard { } } - // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before - // doing our sequence's work. - let old_handle = self.reconciler.take(); - - let Ok(gate_guard) = gate.enter() else { - // Shutting down, don't start a reconciler - return None; - }; - // Advance the sequence before spawning a reconciler, so that sequence waiters // can distinguish between before+after the reconcile completes. 
- self.sequence = self.sequence.next(); + self.ensure_sequence_ahead(); let reconciler_cancel = cancel.child_token(); let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); @@ -945,6 +1004,7 @@ impl TenantShard { compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, + _resource_units: units, cancel: reconciler_cancel.clone(), persistence: persistence.clone(), compute_notify_failure: false, @@ -1011,16 +1071,18 @@ impl TenantShard { status: outcome_label, }); - result_tx - .send(ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - pending_compute_notification: reconciler.compute_notify_failure, - }) - .ok(); + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our result. + let result = ReconcileResult { + sequence: reconcile_seq, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + }; + + result_tx.send(result).ok(); } .instrument(reconciler_span), ); @@ -1111,6 +1173,7 @@ impl TenantShard { error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), pending_compute_notification: false, + delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), }) } From cdccab4bd9f39c4f491df2e3165b8ebc0af3e4bb Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 25 Apr 2024 11:14:04 +0100 Subject: [PATCH 074/157] reduce complexity of proxy protocol parse (#7078) ## Problem The `WithClientIp` AsyncRead/Write abstraction never filled me with much joy. I would just rather read the protocol header once and then get the remaining buf and reader. ## Summary of changes * Replace `WithClientIp::wait_for_addr` with `read_proxy_protocol`. * Replace `WithClientIp` with `ChainRW`. * Optimise `ChainRW` to make the standard path more optimal. --- proxy/src/protocol2.rs | 427 +++++++++++++++------------------------ proxy/src/proxy.rs | 14 +- proxy/src/proxy/tests.rs | 2 +- proxy/src/serverless.rs | 7 +- 4 files changed, 168 insertions(+), 282 deletions(-) diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 70f9b4bfab..1dd4563514 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,42 +1,26 @@ //! Proxy Protocol V2 implementation use std::{ - future::{poll_fn, Future}, io, net::SocketAddr, - pin::{pin, Pin}, - task::{ready, Context, Poll}, + pin::Pin, + task::{Context, Poll}, }; -use bytes::{Buf, BytesMut}; -use hyper::server::conn::AddrIncoming; +use bytes::BytesMut; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -pub struct ProxyProtocolAccept { - pub incoming: AddrIncoming, - pub protocol: &'static str, -} - pin_project! 
{ - pub struct WithClientIp { + /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough + pub struct ChainRW { #[pin] pub inner: T, buf: BytesMut, - tlv_bytes: u16, - state: ProxyParse, } } -#[derive(Clone, PartialEq, Debug)] -enum ProxyParse { - NotStarted, - - Finished(SocketAddr), - None, -} - -impl AsyncWrite for WithClientIp { +impl AsyncWrite for ChainRW { #[inline] fn poll_write( self: Pin<&mut Self>, @@ -71,267 +55,174 @@ impl AsyncWrite for WithClientIp { } } -impl WithClientIp { - pub fn new(inner: T) -> Self { - WithClientIp { - inner, - buf: BytesMut::with_capacity(128), - tlv_bytes: 0, - state: ProxyParse::NotStarted, - } - } - - pub fn client_addr(&self) -> Option { - match self.state { - ProxyParse::Finished(socket) => Some(socket), - _ => None, - } - } -} - -impl WithClientIp { - pub async fn wait_for_addr(&mut self) -> io::Result> { - match self.state { - ProxyParse::NotStarted => { - let mut pin = Pin::new(&mut *self); - let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?; - match addr { - Some(addr) => self.state = ProxyParse::Finished(addr), - None => self.state = ProxyParse::None, - } - Ok(addr) - } - ProxyParse::Finished(addr) => Ok(Some(addr)), - ProxyParse::None => Ok(None), - } - } -} - /// Proxy Protocol Version 2 Header const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -impl WithClientIp { - /// implementation of - /// Version 2 (Binary Format) - fn poll_client_ip( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - // The binary header format starts with a constant 12 bytes block containing the protocol signature : - // \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A - while self.buf.len() < 16 { - let mut this = self.as_mut().project(); - let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?; +pub async fn read_proxy_protocol( + mut read: T, +) -> std::io::Result<(ChainRW, Option)> { + let mut buf = BytesMut::with_capacity(128); + while buf.len() < 16 { + let bytes_read = read.read_buf(&mut buf).await?; - // exit for bad header - let len = usize::min(self.buf.len(), HEADER.len()); - if self.buf[..len] != HEADER[..len] { - return Poll::Ready(Ok(None)); - } - - // if no more bytes available then exit - if ready!(bytes_read) == 0 { - return Poll::Ready(Ok(None)); - }; + // exit for bad header + let len = usize::min(buf.len(), HEADER.len()); + if buf[..len] != HEADER[..len] { + return Ok((ChainRW { inner: read, buf }, None)); } - // The next byte (the 13th one) is the protocol version and command. - // The highest four bits contains the version. As of this specification, it must - // always be sent as \x2 and the receiver must only accept this value. - let vc = self.buf[12]; - let version = vc >> 4; - let command = vc & 0b1111; - if version != 2 { - return Poll::Ready(Err(io::Error::new( + // if no more bytes available then exit + if bytes_read == 0 { + return Ok((ChainRW { inner: read, buf }, None)); + }; + } + + let header = buf.split_to(16); + + // The next byte (the 13th one) is the protocol version and command. + // The highest four bits contains the version. As of this specification, it must + // always be sent as \x2 and the receiver must only accept this value. + let vc = header[12]; + let version = vc >> 4; + let command = vc & 0b1111; + if version != 2 { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol version. 
expected version 2", + )); + } + match command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. + 0 => {} + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get original the address. + 1 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + _ => { + return Err(io::Error::new( io::ErrorKind::Other, - "invalid proxy protocol version. expected version 2", - ))); + "invalid proxy protocol command. expected local (0) or proxy (1)", + )) } - match command { - // the connection was established on purpose by the proxy - // without being relayed. The connection endpoints are the sender and the - // receiver. Such connections exist when the proxy sends health-checks to the - // server. The receiver must accept this connection as valid and must use the - // real connection endpoints and discard the protocol block including the - // family which is ignored. - 0 => {} - // the connection was established on behalf of another node, - // and reflects the original connection endpoints. The receiver must then use - // the information provided in the protocol block to get original the address. - 1 => {} - // other values are unassigned and must not be emitted by senders. Receivers - // must drop connections presenting unexpected values here. - _ => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol command. expected local (0) or proxy (1)", - ))) - } - }; + }; - // The 14th byte contains the transport protocol and address family. The highest 4 - // bits contain the address family, the lowest 4 bits contain the protocol. - let ft = self.buf[13]; - let address_length = match ft { - // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - 0x11 | 0x12 => 12, - // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - 0x21 | 0x22 => 36, - // unspecified or unix stream. ignore the addresses - _ => 0, - }; + // The 14th byte contains the transport protocol and address family. The highest 4 + // bits contain the address family, the lowest 4 bits contain the protocol. + let ft = header[13]; + let address_length = match ft { + // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + 0x11 | 0x12 => 12, + // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 + // protocol family. 
Address length is 2*16 + 2*2 = 36 bytes. + // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + 0x21 | 0x22 => 36, + // unspecified or unix stream. ignore the addresses + _ => 0, + }; - // The 15th and 16th bytes is the address length in bytes in network endian order. - // It is used so that the receiver knows how many address bytes to skip even when - // it does not implement the presented protocol. Thus the length of the protocol - // header in bytes is always exactly 16 + this value. When a sender presents a - // LOCAL connection, it should not present any address so it sets this field to - // zero. Receivers MUST always consider this field to skip the appropriate number - // of bytes and must not assume zero is presented for LOCAL connections. When a - // receiver accepts an incoming connection showing an UNSPEC address family or - // protocol, it may or may not decide to log the address information if present. - let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap()); - if remaining_length < address_length { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol length. not enough to fit requested IP addresses", - ))); + // The 15th and 16th bytes is the address length in bytes in network endian order. + // It is used so that the receiver knows how many address bytes to skip even when + // it does not implement the presented protocol. Thus the length of the protocol + // header in bytes is always exactly 16 + this value. When a sender presents a + // LOCAL connection, it should not present any address so it sets this field to + // zero. Receivers MUST always consider this field to skip the appropriate number + // of bytes and must not assume zero is presented for LOCAL connections. When a + // receiver accepts an incoming connection showing an UNSPEC address family or + // protocol, it may or may not decide to log the address information if present. + let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap()); + if remaining_length < address_length { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol length. not enough to fit requested IP addresses", + )); + } + drop(header); + + while buf.len() < remaining_length as usize { + if read.read_buf(&mut buf).await? == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "stream closed while waiting for proxy protocol addresses", + )); } - - while self.buf.len() < 16 + address_length as usize { - let mut this = self.as_mut().project(); - if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - ))); - } - } - - let this = self.as_mut().project(); - - // we are sure this is a proxy protocol v2 entry and we have read all the bytes we need - // discard the header we have parsed - this.buf.advance(16); - - // Starting from the 17th byte, addresses are presented in network byte order. 
- // The address order is always the same : - // - source layer 3 address in network byte order - // - destination layer 3 address in network byte order - // - source layer 4 address if any, in network byte order (port) - // - destination layer 4 address if any, in network byte order (port) - let addresses = this.buf.split_to(address_length as usize); - let socket = match address_length { - 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - _ => None, - }; - - *this.tlv_bytes = remaining_length - address_length; - self.as_mut().skip_tlv_inner(); - - Poll::Ready(Ok(socket)) } - #[cold] - fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let ip = ready!(self.as_mut().poll_client_ip(cx)?); - match ip { - Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x), - None => *self.as_mut().project().state = ProxyParse::None, + // Starting from the 17th byte, addresses are presented in network byte order. + // The address order is always the same : + // - source layer 3 address in network byte order + // - destination layer 3 address in network byte order + // - source layer 4 address if any, in network byte order (port) + // - destination layer 4 address if any, in network byte order (port) + let addresses = buf.split_to(remaining_length as usize); + let socket = match address_length { + 12 => { + let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) } - Poll::Ready(Ok(())) - } + 36 => { + let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + _ => None, + }; - #[cold] - fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let mut this = self.as_mut().project(); - // we know that this.buf is empty - debug_assert_eq!(this.buf.len(), 0); - - this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize); - ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?); - self.skip_tlv_inner(); - - Poll::Ready(Ok(())) - } - - fn skip_tlv_inner(self: Pin<&mut Self>) { - let tlv_bytes_read = match u16::try_from(self.buf.len()) { - // we read more than u16::MAX therefore we must have read the full tlv_bytes - Err(_) => self.tlv_bytes, - // we might not have read the full tlv bytes yet - Ok(n) => u16::min(n, self.tlv_bytes), - }; - let this = self.project(); - *this.tlv_bytes -= tlv_bytes_read; - this.buf.advance(tlv_bytes_read as usize); - } + Ok((ChainRW { inner: read, buf }, socket)) } -impl AsyncRead for WithClientIp { +impl AsyncRead for ChainRW { #[inline] fn poll_read( - mut self: Pin<&mut Self>, + self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - // I'm assuming these 3 comparisons will be easy to branch predict. - // especially with the cold attributes - // which should make this read wrapper almost invisible - - if let ProxyParse::NotStarted = self.state { - ready!(self.as_mut().read_ip(cx)?); - } - - while self.tlv_bytes > 0 { - ready!(self.as_mut().skip_tlv(cx)?) 
- } - - let this = self.project(); - if this.buf.is_empty() { - this.inner.poll_read(cx, buf) + if self.buf.is_empty() { + self.project().inner.poll_read(cx, buf) } else { - // we know that tlv_bytes is 0 - debug_assert_eq!(*this.tlv_bytes, 0); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) + self.read_from_buf(buf) } } } +impl ChainRW { + #[cold] + fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { + debug_assert!(!self.buf.is_empty()); + let this = self.project(); + + let write = usize::min(this.buf.len(), buf.remaining()); + let slice = this.buf.split_to(write).freeze(); + buf.put_slice(&slice); + + // reset the allocation so it can be freed + if this.buf.is_empty() { + *this.buf = BytesMut::new(); + } + + Poll::Ready(Ok(())) + } +} + #[cfg(test)] mod tests { - use std::pin::pin; - use tokio::io::AsyncReadExt; - use crate::protocol2::{ProxyParse, WithClientIp}; + use crate::protocol2::read_proxy_protocol; #[tokio::test] async fn test_ipv4() { @@ -353,16 +244,15 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([127, 0, 0, 1], 65535).into()) - ); + assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); } #[tokio::test] @@ -385,17 +275,17 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); assert_eq!( - read.state, - ProxyParse::Finished( - ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() - ) + addr, + Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) ); } @@ -403,24 +293,24 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] @@ -446,15 +336,14 @@ mod tests { let extra_data = [0xaa; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([55, 56, 57, 58], 65535).into()) - ); + assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); } } diff --git 
a/proxy/src/proxy.rs b/proxy/src/proxy.rs index a4554eef38..ddae6536fb 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,7 +17,7 @@ use crate::{ context::RequestMonitoring, error::ReportableError, metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::WithClientIp, + protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, stream::{PqStream, Stream}, EndpointCacheKey, @@ -88,20 +88,18 @@ pub async fn task_main( tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); connections.spawn(async move { - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - match socket.wait_for_addr().await { - Ok(Some(addr)) => peer_addr = addr.ip(), + let (socket, peer_addr) = match read_proxy_protocol(socket).await{ + Ok((socket, Some(addr))) => (socket, addr.ip()), Err(e) => { error!("per-client task finished with an error: {e:#}"); return; } - Ok(None) if config.require_client_ip => { + Ok((_socket, None)) if config.require_client_ip => { error!("missing required client IP"); return; } - Ok(None) => {} - } + Ok((socket, None)) => (socket, peer_addr.ip()) + }; match socket.inner.set_nodelay(true) { Ok(()) => {}, diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index e0ec90cb44..ad48af0093 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -174,7 +174,7 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let client = WithClientIp::new(client); + let (client, _) = read_proxy_protocol(client).await?; let mut stream = match handshake(client, tls.as_ref(), false).await? { HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index b0f4026c76..1a0d1f7b0e 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -33,7 +33,7 @@ use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; use crate::metrics::Metrics; -use crate::protocol2::WithClientIp; +use crate::protocol2::read_proxy_protocol; use crate::proxy::run_until_cancelled; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; @@ -158,9 +158,8 @@ async fn connection_handler( .guard(crate::metrics::Protocol::Http); // handle PROXY protocol - let mut conn = WithClientIp::new(conn); - let peer = match conn.wait_for_addr().await { - Ok(peer) => peer, + let (conn, peer) = match read_proxy_protocol(conn).await { + Ok(c) => c, Err(e) => { tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); return; From a3d62b31bbafdf15ce6c83ea7bcd594f5870193a Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 25 Apr 2024 13:16:27 +0200 Subject: [PATCH 075/157] Update connect to compute and wake compute retry configs (#7509) ## Problem ## Summary of changes Decrease waiting time --- proxy/src/config.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index ae7606e5d4..a32ab8c43c 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -533,13 +533,13 @@ pub struct RetryConfig { impl RetryConfig { /// Default options for RetryConfig. - /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. 
+ /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = - "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; - /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. - /// Cplane has timeout of 60s on each request. + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Cplane has timeout of 60s on each request. 8m7s in total. pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = - "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; /// Parse retry options passed via cmdline. /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. From b1d47f39117ed55dfcee7c8afe0b7c32f0336b8e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 25 Apr 2024 13:38:51 +0200 Subject: [PATCH 076/157] proxy: Fix cancellations (#7510) ## Problem Cancellations were published to the channel, that was never read. ## Summary of changes Fallback to global redis publishing. --- proxy/src/bin/proxy.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 760ccf40d4..a1b4c21947 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let redis_publisher = match ®ional_redis_client { + let redis_publisher = match &redis_notifications_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), From e4a279db132b532c31da97daf09bd133f6c70bcc Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 25 Apr 2024 13:29:17 +0100 Subject: [PATCH 077/157] pageserver: coalesce read paths (#7477) ## Problem We are currently supporting two read paths. No bueno. ## Summary of changes High level: use vectored read path to serve get page requests - gated by `get_impl` config Low level: 1. Add ps config, `get_impl` to specify which read path to use when serving get page requests 2. Fix base cached image handling for the vectored read path. This was subtly broken: previously we would not mark keys that went past their cached lsn as complete. This is a self standing change which could be its own PR, but I've included it here because writing separate tests for it is tricky. 3. Fork get page to use either the legacy or vectored implementation 4. Validate the use of vectored read path when serving get page requests against the legacy implementation. Controlled by `validate_vectored_get` ps config. 5. Use the vectored read path to serve get page requests in tests (with validation). ## Note Since the vectored read path does not go through the page cache to read buffers, this change also amounts to a removal of the buffer page cache. Materialized page cache is still used. 
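As a rough illustration of the dispatch-and-validate shape described above, a minimal sketch under assumptions: `get_legacy`, `get_vectored`, and the surrounding types are hypothetical stand-ins, not the pageserver's actual API.

```rust
// Illustrative sketch of a config-gated read path with optional cross-validation.
use std::collections::BTreeMap;
use std::ops::Range;

#[derive(Clone, Copy)]
enum GetImpl {
    Legacy,
    Vectored,
}

struct ReadConfig {
    get_impl: GetImpl,
    validate_vectored_get: bool,
}

fn get_page(cfg: &ReadConfig, key: u64, lsn: u64) -> Result<Vec<u8>, String> {
    match cfg.get_impl {
        GetImpl::Legacy => get_legacy(key, lsn),
        GetImpl::Vectored => {
            // The vectored path reads a single-key keyspace and pops the one result.
            let res = get_vectored(key..key + 1, lsn).and_then(|mut pages| {
                pages
                    .remove(&key)
                    .ok_or_else(|| format!("vectored get returned nothing for {key}"))
            });
            if cfg.validate_vectored_get {
                // Cross-check against the legacy path; a mismatch means a read-path bug.
                assert_eq!(res, get_legacy(key, lsn), "read path mismatch at {key}@{lsn}");
            }
            res
        }
    }
}

// Stand-ins for the two real read implementations.
fn get_legacy(_key: u64, _lsn: u64) -> Result<Vec<u8>, String> {
    Ok(vec![0u8; 8192])
}

fn get_vectored(keys: Range<u64>, _lsn: u64) -> Result<BTreeMap<u64, Vec<u8>>, String> {
    Ok(keys.map(|k| (k, vec![0u8; 8192])).collect())
}
```

The `validate_vectored_get` knob enables exactly this kind of cross-check against the legacy implementation while the vectored path is rolled out as the default.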
--- .github/workflows/build_and_test.yml | 1 + control_plane/src/local_env.rs | 2 + control_plane/src/pageserver.rs | 7 + libs/pageserver_api/src/keyspace.rs | 5 + pageserver/src/bin/pageserver.rs | 4 +- pageserver/src/config.rs | 22 ++- pageserver/src/tenant.rs | 20 ++- pageserver/src/tenant/storage_layer.rs | 36 +++- .../src/tenant/storage_layer/delta_layer.rs | 12 +- .../tenant/storage_layer/inmemory_layer.rs | 2 + pageserver/src/tenant/storage_layer/layer.rs | 6 + pageserver/src/tenant/timeline.rs | 157 +++++++++++++++--- test_runner/fixtures/neon_fixtures.py | 7 + test_runner/regress/test_broken_timeline.py | 9 +- test_runner/regress/test_compatibility.py | 5 + test_runner/regress/test_local_file_cache.py | 11 +- test_runner/regress/test_lsn_mapping.py | 10 +- test_runner/regress/test_pg_regress.py | 6 + 18 files changed, 277 insertions(+), 45 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a7e108fac4..65b573663a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -477,6 +477,7 @@ jobs: BUILD_TAG: ${{ needs.tag.outputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 38b7fffd09..2168d4b944 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -129,6 +129,7 @@ pub struct PageServerConf { pub(crate) virtual_file_io_engine: Option, pub(crate) get_vectored_impl: Option, + pub(crate) get_impl: Option, } impl Default for PageServerConf { @@ -141,6 +142,7 @@ impl Default for PageServerConf { http_auth_type: AuthType::Trust, virtual_file_io_engine: None, get_vectored_impl: None, + get_impl: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index abf815f07a..adac7d7bb5 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -92,6 +92,7 @@ impl PageServerNode { http_auth_type, virtual_file_io_engine, get_vectored_impl, + get_impl, } = &self.conf; let id = format!("id={}", id); @@ -111,6 +112,11 @@ impl PageServerNode { } else { String::new() }; + let get_impl = if let Some(get_impl) = get_impl { + format!("get_impl='{get_impl}'") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -124,6 +130,7 @@ impl PageServerNode { broker_endpoint_param, virtual_file_io_engine, get_vectored_impl, + get_impl, ]; if let Some(control_plane_api) = &self.env.control_plane_api { diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index a9e19e8cc7..f73648d306 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -182,6 +182,11 @@ impl KeySpace { pub fn overlaps(&self, range: &Range) -> bool { self.overlaps_at(range).is_some() } + + /// Check if the keyspace contains a key + pub fn contains(&self, key: &Key) -> bool { + self.overlaps(&(*key..key.next())) + } } /// diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 41835f9843..1345223a43 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -121,8 +121,10 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); - // after setting up logging, 
log the effective IO engine choice + // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.get_impl, "starting with get page implementation"); + info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 10d5a22797..96fff1f0c0 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,9 +30,9 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; +use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; @@ -91,6 +91,8 @@ pub mod defaults { pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; @@ -138,6 +140,8 @@ pub mod defaults { #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' +#get_impl = '{DEFAULT_GET_IMPL}' + #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' @@ -284,6 +288,8 @@ pub struct PageServerConf { pub get_vectored_impl: GetVectoredImpl, + pub get_impl: GetImpl, + pub max_vectored_read_bytes: MaxVectoredReadBytes, pub validate_vectored_get: bool, @@ -414,6 +420,8 @@ struct PageServerConfigBuilder { get_vectored_impl: BuilderValue, + get_impl: BuilderValue, + max_vectored_read_bytes: BuilderValue, validate_vectored_get: BuilderValue, @@ -503,6 +511,7 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), @@ -681,6 +690,10 @@ impl PageServerConfigBuilder { self.get_vectored_impl = BuilderValue::Set(value); } + pub fn get_impl(&mut self, value: GetImpl) { + self.get_impl = BuilderValue::Set(value); + } + pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } @@ -750,6 +763,7 @@ impl PageServerConfigBuilder { secondary_download_concurrency, ingest_batch_size, get_vectored_impl, + get_impl, max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, @@ -1035,6 +1049,9 @@ impl PageServerConf { "get_vectored_impl" => { builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) } + "get_impl" => { + builder.get_impl(parse_toml_from_str("get_impl", item)?) + } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? 
as usize; builder.get_max_vectored_read_bytes( @@ -1126,6 +1143,7 @@ impl PageServerConf { ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), @@ -1365,6 +1383,7 @@ background_task_maximum_delay = '334 s' ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") @@ -1438,6 +1457,7 @@ background_task_maximum_delay = '334 s' ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 15350e93e9..ff6194ab00 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3865,6 +3865,7 @@ mod tests { use pageserver_api::key::NON_INHERITED_RANGE; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; + use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; static TEST_KEY: Lazy = @@ -4653,7 +4654,9 @@ mod tests { for read in reads { info!("Doing vectored read on {:?}", read); - let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await; + let vectored_res = tline + .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx) + .await; tline .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) .await; @@ -4698,7 +4701,12 @@ mod tests { let read_lsn = child_timeline.get_last_record_lsn(); let vectored_res = child_timeline - .get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx) + .get_vectored_impl( + aux_keyspace.clone(), + read_lsn, + ValuesReconstructState::new(), + &ctx, + ) .await; child_timeline @@ -4846,7 +4854,12 @@ mod tests { ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], }; let results = child_timeline - .get_vectored_impl(read.clone(), current_lsn, &ctx) + .get_vectored_impl( + read.clone(), + current_lsn, + ValuesReconstructState::new(), + &ctx, + ) .await?; for (key, img_res) in results { @@ -4979,6 +4992,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, + ValuesReconstructState::new(), &ctx, ) .await; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9ddd916700..4f1b56ef9f 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -148,6 +148,29 @@ impl ValuesReconstructState { self.layers_visited } + /// This function is called after reading a keyspace from a layer. + /// It checks if the read path has now moved past the cached Lsn for any keys. 
+ /// + /// Implementation note: We intentionally iterate over the keys for which we've + /// already collected some reconstruct data. This avoids scaling complexity with + /// the size of the search space. + pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { + for (key, value) in self.keys.iter_mut() { + if !keyspace.contains(key) { + continue; + } + + if let Ok(state) = value { + if state.situation != ValueReconstructSituation::Complete + && state.get_cached_lsn() >= Some(advanced_to) + { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + } + } + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -172,11 +195,18 @@ impl ValuesReconstructState { true } Value::WalRecord(rec) => { - let reached_cache = - state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn); + debug_assert!( + Some(lsn) > state.get_cached_lsn(), + "Attempt to collect a record below cached LSN for walredo: {} < {}", + lsn, + state + .get_cached_lsn() + .expect("Assertion can only fire if a cached lsn is present") + ); + let will_init = rec.will_init(); state.records.push((lsn, rec)); - will_init || reached_cache + will_init } }, }; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index a4b2b4f840..a9f8404158 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -217,6 +217,7 @@ pub struct DeltaLayerInner { // values copied from summary index_start_blk: u32, index_root_blk: u32, + lsn_range: Range, file: VirtualFile, file_id: FileId, @@ -745,6 +746,7 @@ impl DeltaLayerInner { file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + lsn_range: actual_summary.lsn_range, max_vectored_read_bytes, })) } @@ -869,7 +871,7 @@ impl DeltaLayerInner { let data_end_offset = self.index_start_offset(); let reads = Self::plan_reads( - keyspace, + &keyspace, lsn_range, data_end_offset, index_reader, @@ -883,11 +885,13 @@ impl DeltaLayerInner { self.do_reads_and_update_state(reads, reconstruct_state) .await; + reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); + Ok(()) } async fn plan_reads( - keyspace: KeySpace, + keyspace: &KeySpace, lsn_range: Range, data_end_offset: u64, index_reader: DiskBtreeReader, @@ -1535,7 +1539,7 @@ mod test { // Plan and validate let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, lsn_range.clone(), disk_offset, reader, @@ -1787,7 +1791,7 @@ mod test { let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, entries_meta.lsn_range.clone(), data_end_offset, index_reader, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index a86d0d48c5..5939b969d6 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -438,6 +438,8 @@ impl InMemoryLayer { } } + reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + Ok(()) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 316a11f8cc..ee9de8de09 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -336,6 +336,12 @@ 
impl Layer { .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await + .map_err(|err| match err { + GetVectoredError::Other(err) => GetVectoredError::Other( + err.context(format!("get_values_reconstruct_data for layer {self}")), + ), + err => err, + }) } /// Download the layer if evicted. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 703654a37c..f1387e10ac 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -119,8 +119,8 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; @@ -653,6 +653,19 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => PageReconstructError::Cancelled, + GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), + err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + err @ GetVectoredError::MissingKey(_) => PageReconstructError::Other(err.into()), + GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), + GetVectoredError::Other(err) => PageReconstructError::Other(err), + } + } +} + impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; @@ -682,6 +695,23 @@ pub enum GetVectoredImpl { Vectored, } +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, @@ -743,16 +773,6 @@ impl Timeline { key: Key, lsn: Lsn, ctx: &RequestContext, - ) -> Result { - self.timeline_get_throttle.throttle(ctx, 1).await; - self.get_impl(key, lsn, ctx).await - } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - ctx: &RequestContext, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); @@ -763,13 +783,7 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); + self.timeline_get_throttle.throttle(ctx, 1).await; // Check the page cache. We will get back the most recent page with lsn <= `lsn`. 
// The cached image can be returned directly if there is no WAL between the cached image @@ -792,10 +806,81 @@ impl Timeline { None => None, }; - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; + match self.conf.get_impl { + GetImpl::Legacy => { + let reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; + + self.get_impl(key, lsn, reconstruct_state, ctx).await + } + GetImpl::Vectored => { + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; + + // Initialise the reconstruct state for the key with the cache + // entry returned above. + let mut reconstruct_state = ValuesReconstructState::new(); + let mut key_state = VectoredValueReconstructState::default(); + key_state.img = cached_page_img; + reconstruct_state.keys.insert(key, Ok(key_state)); + + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value + } + } + None => { + error!( + "Expected {}, but singular vectored get returned nothing", + key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get did not return a value for {}", + key + ))) + } + } + } + } + } + + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + mut reconstruct_state: ValueReconstructState, + ctx: &RequestContext, + ) -> Result { + // XXX: structured stats collection for layer eviction here. + trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(GetKind::Singular) @@ -888,7 +973,9 @@ impl Timeline { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } GetVectoredImpl::Vectored => { - let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx) + .await; if self.conf.validate_vectored_get { self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) @@ -934,7 +1021,9 @@ impl Timeline { for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get_impl(key, lsn, ctx).await; + let block = self + .get_impl(key, lsn, ValueReconstructState::default(), ctx) + .await; use PageReconstructError::*; match block { @@ -952,6 +1041,23 @@ impl Timeline { // level error. return Err(GetVectoredError::MissingKey(key)); } + Err(Other(err)) + if err + .to_string() + .contains("downloading evicted layer file failed") => + { + return Err(GetVectoredError::Other(err)) + } + Err(Other(err)) + if err + .chain() + .any(|cause| cause.to_string().contains("layer loading failed")) => + { + // The intent here is to achieve error parity with the vectored read path. + // When vectored read fails to load a layer it fails the whole read, hence + // we mimic this behaviour here to keep the validation happy. 
+ return Err(GetVectoredError::Other(err)); + } _ => { values.insert(key, block); key = key.next(); @@ -967,10 +1073,9 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + mut reconstruct_state: ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let get_kind = if keyspace.total_size() == 1 { GetKind::Singular } else { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c2c661088b..fcd33bb66a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -507,6 +507,11 @@ class NeonEnvBuilder: self.pageserver_get_vectored_impl = "vectored" log.debug('Overriding pageserver get_vectored_impl config to "vectored"') + self.pageserver_get_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": + self.pageserver_get_impl = "vectored" + log.debug('Overriding pageserver get_impl config to "vectored"') + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1078,6 +1083,8 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_get_vectored_impl is not None: ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl + if config.pageserver_get_impl is not None: + ps_cfg["get_impl"] = config.pageserver_get_impl # Create a corresponding NeonPageserver object self.pageservers.append( diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 804ad135ce..1279c1bf81 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,11 +17,16 @@ from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep # running. def test_local_corruption(neon_env_builder: NeonEnvBuilder): + if neon_env_builder.pageserver_get_impl == "vectored": + reconstruct_function_name = "get_values_reconstruct_data" + else: + reconstruct_function_name = "get_value_reconstruct_data" + env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - ".*get_value_reconstruct_data for layer .*", + f".*{reconstruct_function_name} for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -84,7 +89,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ddad98a5fa..2a371eae72 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -226,6 +226,11 @@ def test_forward_compatibility( ) try: + # Previous version neon_local and pageserver are not aware + # of the new config. 
+ # TODO: remove this once the code reaches main + neon_env_builder.pageserver_get_impl = None + neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 38f2034c18..76c6581448 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -4,16 +4,21 @@ import threading import time from typing import List -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str): + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. + neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + + env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") endpoint = env.endpoints.create_start( diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 5813231aab..37676ab0d4 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,3 +1,4 @@ +import re import time from datetime import datetime, timedelta, timezone @@ -109,6 +110,11 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): + if neon_env_builder.pageserver_get_impl == "vectored": + key_not_found_error = r".*Requested key.*not found,*" + else: + key_not_found_error = r".*could not find data for key.*" + env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") @@ -177,8 +183,8 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): raise RuntimeError("there should have been an 'could not find data for key' error") except PageserverApiException as error: assert error.status_code == 500 - assert str(error).startswith("could not find data for key") - env.pageserver.allowed_errors.append(".*could not find data for key.*") + assert re.match(key_not_found_error, str(error)) + env.pageserver.allowed_errors.append(key_not_found_error) # Probe a bunch of timestamps in the valid range step_size = 100 diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index e4219ec7a6..2b1b7fff34 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -18,6 +18,7 @@ from fixtures.remote_storage import s3_storage def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, + build_type: str, pg_bin, capsys, base_dir: Path, @@ -30,6 +31,11 @@ def test_pg_regress( """ if shard_count is not None: neon_env_builder.num_pageservers = shard_count + + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. 
+ neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) From 5357f401831a42c7f11adc141ce78d7e795e3bc9 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 25 Apr 2024 17:26:18 +0200 Subject: [PATCH 078/157] proxy: Workaround switch to the regional redis (#7513) ## Problem Start switching from the global redis to the regional one ## Summary of changes * Publish cancellations to the regional redis * Listen notifications from both: global and regional --- proxy/src/bin/proxy.rs | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index a1b4c21947..39f6bc8b6d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -409,15 +409,28 @@ async fn main() -> anyhow::Result<()> { if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - if let Some(redis_notifications_client) = redis_notifications_client { - let cache = api.caches.project_info.clone(); - maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client, - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } } if let Some(regional_redis_client) = regional_redis_client { let cache = api.caches.endpoints_cache.clone(); From c59abedd85b81d832225a2490ba066e0c6993fc9 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 25 Apr 2024 12:39:27 -0400 Subject: [PATCH 079/157] chore(pageserver): temporary metrics on ingestion time (#7515) As a follow-up on https://github.com/neondatabase/neon/pull/7467, also measure the ingestion operation speed. 
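
The measurement itself is the standard Prometheus histogram-timer pattern already used elsewhere in `pageserver/src/metrics.rs`. A minimal sketch of the idea, assuming the plain `prometheus` crate and collapsing the `WalIngestMetrics` struct down to a single static (names and buckets simplified; the real metric added below is `pageserver_wal_ingest_put_value_seconds`, reached through `WAL_INGEST.time_spent_on_ingest`):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram, Histogram};

// Simplified stand-in for the histogram added to WalIngestMetrics below.
static INGEST_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_ingest_put_value_seconds",
        "Actual time spent on ingesting a record"
    )
    .expect("failed to define a metric")
});

fn timed_commit(do_commit: impl FnOnce()) {
    // start_timer() begins measuring; observe_duration() records the elapsed
    // wall-clock time into the histogram (dropping the timer would do the same).
    let timer = INGEST_SECONDS.start_timer();
    do_commit();
    timer.observe_duration();
}
```

This mirrors what the diff does in `DatadirModification::commit`: start the timer before the batched writes, observe it once the writer has been updated.
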
Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 66 ++++++++++++++++------------- pageserver/src/pgdatadir_mapping.rs | 5 +++ 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6ce7f286b3..e4b314f805 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1519,35 +1519,6 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { } }); -pub(crate) struct WalIngestMetrics { - pub(crate) bytes_received: IntCounter, - pub(crate) records_received: IntCounter, - pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, -} - -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { - bytes_received: register_int_counter!( - "pageserver_wal_ingest_bytes_received", - "Bytes of WAL ingested from safekeepers", - ) - .unwrap(), - records_received: register_int_counter!( - "pageserver_wal_ingest_records_received", - "Number of WAL records received from safekeepers" - ) - .expect("failed to define a metric"), - records_committed: register_int_counter!( - "pageserver_wal_ingest_records_committed", - "Number of WAL records which resulted in writes to pageserver storage" - ) - .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), -}); pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, @@ -1749,6 +1720,43 @@ macro_rules! redo_bytes_histogram_count_buckets { }; } +pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, + pub(crate) records_received: IntCounter, + pub(crate) records_committed: IntCounter, + pub(crate) records_filtered: IntCounter, + pub(crate) time_spent_on_ingest: Histogram, +} + +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), + records_received: register_int_counter!( + "pageserver_wal_ingest_records_received", + "Number of WAL records received from safekeepers" + ) + .expect("failed to define a metric"), + records_committed: register_int_counter!( + "pageserver_wal_ingest_records_committed", + "Number of WAL records which resulted in writes to pageserver storage" + ) + .expect("failed to define a metric"), + records_filtered: register_int_counter!( + "pageserver_wal_ingest_records_filtered", + "Number of WAL records filtered out due to sharding" + ) + .expect("failed to define a metric"), + time_spent_on_ingest: register_histogram!( + "pageserver_wal_ingest_put_value_seconds", + "Actual time spent on ingesting a record", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric"), +}); + pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 2c98c0b6c8..ed1d737583 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,6 +9,7 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; +use crate::metrics::WAL_INGEST; use crate::repository::*; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use 
crate::walrecord::NeonWalRecord; @@ -1551,6 +1552,8 @@ impl<'a> DatadirModification<'a> { pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; + let timer = WAL_INGEST.time_spent_on_ingest.start_timer(); + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1590,6 +1593,8 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } + timer.observe_duration(); + Ok(()) } From 04a682021f34a39a2e1ba36ec8e9e7cf1d911a9c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:36 +0300 Subject: [PATCH 080/157] Remove the now-unused 'latest' arguments (#7377) The 'latest' argument was passed to the functions in pgdatadir_mapping.rs to know when they can update the relsize cache. Commit e69ff3fc00 changed how the relsize cache is updated, making the 'latest' argument unused. --- pageserver/src/basebackup.rs | 4 +- pageserver/src/page_service.rs | 14 +--- pageserver/src/pgdatadir_mapping.rs | 14 ++-- pageserver/src/walingest.rs | 110 +++++++++++++--------------- 4 files changed, 63 insertions(+), 79 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 107758f385..ba047745f1 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -376,7 +376,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) + .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) .await?; // If the relation is empty, create an empty file @@ -397,7 +397,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fa6b81ac72..69475c2dc7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -936,7 +936,7 @@ impl PageServerHandler { .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -964,7 +964,7 @@ impl PageServerHandler { .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_size(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -992,13 +992,7 @@ impl PageServerHandler { .await?; let total_blocks = timeline - .get_db_size( - DEFAULTTABLESPACE_OID, - req.dbnode, - Version::Lsn(lsn), - req.latest, - ctx, - ) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -1170,7 +1164,7 @@ impl PageServerHandler { .await?; let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ed1d737583..14bcc50e7e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -176,7 +176,6 @@ impl Timeline { tag: RelTag, blknum: BlockNumber, version: Version<'_>, - 
latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -185,7 +184,7 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -207,7 +206,6 @@ impl Timeline { spcnode: Oid, dbnode: Oid, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; @@ -215,7 +213,7 @@ impl Timeline { let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -226,7 +224,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -240,7 +237,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, version, latest, ctx).await? + && !self.get_rel_exists(tag, version, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -263,7 +260,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - _latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -1095,7 +1091,7 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { let total_blocks = self .tline - .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) .await?; // Remove entry from dbdir @@ -1194,7 +1190,7 @@ impl<'a> DatadirModification<'a> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline - .get_rel_exists(rel, Version::Modified(self), true, ctx) + .get_rel_exists(rel, Version::Modified(self), ctx) .await? { let size_key = rel_size_to_key(rel); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 4f83b118ae..79f075b877 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1034,7 +1034,7 @@ impl WalIngest { let nblocks = modification .tline - .get_rel_size(src_rel, Version::Modified(modification), true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1068,13 +1068,7 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn( - src_rel, - blknum, - Version::Modified(modification), - true, - ctx, - ) + .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1242,7 +1236,7 @@ impl WalIngest { }; if modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1541,7 +1535,7 @@ impl WalIngest { nblocks } else if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1553,7 +1547,7 @@ impl WalIngest { } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? 
}; @@ -1650,14 +1644,14 @@ async fn get_relsize( ) -> anyhow::Result { let nblocks = if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { 0 } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; Ok(nblocks) @@ -1732,29 +1726,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1762,46 +1756,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) .await?, test_img("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1817,19 +1811,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 1 at 4") ); @@ -1837,13 +1831,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - 
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1856,7 +1850,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) .await?, 0 ); @@ -1869,19 +1863,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) .await?, test_img("foo blk 1") ); @@ -1894,21 +1888,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img("foo blk 1500") ); @@ -1935,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); @@ -1954,7 +1948,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) .await?, false ); @@ -1972,13 +1966,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, 1 ); @@ -2011,24 +2005,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2039,7 +2033,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) .await?, test_img(&data) ); @@ -2056,7 +2050,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2066,7 +2060,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img(&data) ); @@ -2075,7 +2069,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2084,7 +2078,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img(&data) ); @@ -2104,13 +2098,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2120,7 +2114,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img(&data) ); @@ -2154,7 +2148,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2168,7 +2162,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2183,7 +2177,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2201,7 +2195,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); From 4917f52c8850ac77cc8a42f9916435f5da18f2f4 Mon Sep 17 00:00:00 2001 From: 
Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:42 +0300 Subject: [PATCH 081/157] Server support for new pagestream protocol version (#7377) In the old protocol version, the client sent with each request: - latest: bool. If true, the client requested the latest page version, and the 'lsn' was just a hint of when the page was last modified - lsn: Lsn, the page version to return This protocol didn't allow requesting a page at a particular non-latest LSN and *also* sending a hint on when the page was last modified. That put a read only compute into an awkward position where it had to either request each page at the replay-LSN, which could be very close to the last LSN written in the primary and therefore require the pageserver to wait for it to arrive, or an older LSN which could already be garbage collected in the pageserver, resulting in an error. The new protocol version fixes that by allowing a read only compute to send both LSNs. To use the new protocol version, use "pagestream_v2" command instead of just "pagestream". The old protocol version is still supported, for compatibility with old computes (and in fact there is no client support yet, it is added by the next commit). --- libs/pageserver_api/src/models.rs | 143 +++++++---- pageserver/client/src/page_service.rs | 2 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 8 +- pageserver/src/page_service.rs | 233 +++++++++++------- trace/src/main.rs | 8 +- 5 files changed, 254 insertions(+), 140 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e334a68a1e..4ce1ecde26 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -848,39 +848,72 @@ impl TryFrom for PagestreamBeMessageTag { } } +// In the V2 protocol version, a GetPage request contains two LSN values: +// +// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means +// "get the latest version present". It's used by the primary server, which knows that no one else +// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is +// Lsn::Max. Standby servers use the current replay LSN as the request LSN. +// +// not_modified_since: Hint to the pageserver that the client knows that the page has not been +// modified between 'not_modified_since' and the request LSN. It's always correct to set +// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but +// passing an earlier LSN can speed up the request, by allowing the pageserver to process the +// request without waiting for 'request_lsn' to arrive. +// +// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and +// 'latest' was set to true. The V2 interface was added because there was no correct way for a +// standby to request a page at a particular non-latest LSN, and also include the +// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the +// request, if the standby knows that the page hasn't been modified since, and risk getting an error +// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could +// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 +// interface allows sending both LSNs, and let the pageserver do the right thing. 
There is no +// difference in the responses between V1 and V2. +// +// The Request structs below reflect the V2 interface. If V1 is used, the parse function +// maps the old format requests to the new format. +// +#[derive(Clone, Copy)] +pub enum PagestreamProtocolVersion { + V1, + V2, +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamNblocksRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetPageRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, pub blkno: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamDbSizeRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub dbnode: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetSlruSegmentRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub kind: u8, pub segno: u32, } @@ -927,14 +960,16 @@ pub struct TenantHistorySize { } impl PagestreamFeMessage { + /// Serialize a compute -> pageserver message. This is currently only used in testing + /// tools. Always uses protocol version 2. pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -943,8 +978,8 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -953,8 +988,8 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -964,15 +999,15 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { bytes.put_u8(4); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } @@ -981,18 +1016,40 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { - // TODO these gets can fail - + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
let msg_tag = body.read_u8()?; + + let (request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V1 => { + // In the old protocol, each message starts with a boolean 'latest' flag, + // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and + // 'not_modified_since', used in the new protocol version. + let latest = body.read_u8()? != 0; + let request_lsn = Lsn::from(body.read_u64::()?); + if latest { + (Lsn::MAX, request_lsn) // get latest version + } else { + (request_lsn, request_lsn) // get version at specified LSN + } + } + }; + + // The rest of the messages are the same between V1 and V2 match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1001,8 +1058,8 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1011,8 +1068,8 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1022,14 +1079,14 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, dbnode: body.read_u32::()?, })), 4 => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { - latest: body.read_u8()? 
!= 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, kind: body.read_u8()?, segno: body.read_u32::()?, }, @@ -1157,8 +1214,8 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1167,8 +1224,8 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: false, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(4), rel: RelTag { forknum: 1, spcnode: 2, @@ -1177,8 +1234,8 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1188,14 +1245,16 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2) + .unwrap(); assert!(msg == reconstructed); } } diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 49175b3b90..f9507fc47a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) .await?; let Client { cancel_on_client_drop, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index c3d8e61a2c..5043a207fc 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -312,8 +312,12 @@ async fn main_impl( let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, rel: rel_tag, blkno: block_no, } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 69475c2dc7..96d2397c94 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,13 +1,5 @@ -// //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -// -// It is possible to connect here using usual psql/pgbench/libpq. Following -// commands are supported now: -// *status* -- show actual info about this pageserver, -// *pagestream* -- enter mode where smgr and pageserver talk with their -// custom protocol. 
-// use anyhow::Context; use async_compression::tokio::write::GzipEncoder; @@ -23,7 +15,7 @@ use pageserver_api::models::{ PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, - PagestreamNblocksResponse, + PagestreamNblocksResponse, PagestreamProtocolVersion, }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; @@ -551,6 +543,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -613,14 +606,15 @@ impl PageServerHandler { t.trace(©_data_bytes) } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; // TODO: We could create a new per-request context here, with unique ID. // Currently we use the same per-timeline context for all requests let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -629,7 +623,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -639,7 +633,7 @@ impl PageServerHandler { } PagestreamFeMessage::GetPage(req) => { // shard_id is filled in by the handler - let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -648,7 +642,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -657,7 +651,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); ( self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -838,83 +832,80 @@ impl PageServerHandler { /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about - /// which version of the page is being requested. 
The client can request the - /// latest version of the page, or the version that's valid at a particular - /// LSN. The primary compute node will always request the latest page - /// version, while a standby will request a version at the LSN that it's - /// currently caught up to. + /// which version of the page is being requested. The primary compute node + /// will always request the latest page version, by setting 'request_lsn' to + /// the last inserted or flushed WAL position, while a standby will request + /// a version at the LSN that it's currently caught up to. /// /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. + /// + /// In addition to the request LSN, each request carries another LSN, + /// 'not_modified_since', which is a hint to the pageserver that the client + /// knows that the page has not been modified between 'not_modified_since' + /// and the request LSN. This allows skipping the wait, as long as the WAL + /// up to 'not_modified_since' has arrived. If the client doesn't have any + /// information about when the page was modified, it will use + /// not_modified_since == lsn. If the client lies and sends a too low + /// not_modified_hint such that there are in fact later page versions, the + /// behavior is undefined: the pageserver may return any of the page versions + /// or an error. async fn wait_or_get_last_lsn( timeline: &Timeline, - mut lsn: Lsn, - latest: bool, + request_lsn: Lsn, + not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, ) -> Result { - if latest { - // Latest page version was requested. If LSN is given, it is a hint - // to the page server that there have been no modifications to the - // page after that LSN. If we haven't received WAL up to that point, - // wait until it arrives. - let last_record_lsn = timeline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); - // Note: this covers the special case that lsn == Lsn(0). That - // special case means "return the latest version whatever it is", - // and it's used for bootstrapping purposes, when the page server is - // connected directly to the compute node. That is needed because - // when you connect to the compute node, to receive the WAL, the - // walsender process will do a look up in the pg_authid catalog - // table for authentication. That poses a deadlock problem: the - // catalog table lookup will send a GetPage request, but the GetPage - // request will block in the page server because the recent WAL - // hasn't been received yet, and it cannot be received until the - // walsender completes the authentication and starts streaming the - // WAL. - if lsn <= last_record_lsn { - // It might be better to use max(lsn, latest_gc_cutoff_lsn) instead - // last_record_lsn. That would give the same result, since we know - // that there haven't been modifications since 'lsn'. Using an older - // LSN might be faster, because that could allow skipping recent - // layers when finding the page. 
- lsn = last_record_lsn; + // Sanity check the request + if request_lsn < not_modified_since { + return Err(PageStreamError::BadRequest( + format!( + "invalid request with request LSN {} and not_modified_since {}", + request_lsn, not_modified_since, + ) + .into(), + )); + } + + if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) } else { - timeline - .wait_lsn( - lsn, - crate::tenant::timeline::WaitLsnWaiter::PageService, - ctx, - ) - .await?; - // Since we waited for 'lsn' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the - // last-record LSN can advance immediately after we return - // anyway) - } - } else { - if lsn == Lsn(0) { - return Err(PageStreamError::BadRequest( - "invalid LSN(0) in request".into(), - )); - } + PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + request_lsn, **latest_gc_cutoff_lsn + ).into()) + }); + } + + // Wait for WAL up to 'not_modified_since' to arrive, if necessary + if not_modified_since > last_record_lsn { timeline .wait_lsn( - lsn, + not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, ctx, ) .await?; + // Since we waited for 'not_modified_since' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + Ok(not_modified_since) + } else { + // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) + // here instead. That would give the same result, since we know that there + // haven't been any modifications since 'not_modified_since'. Using an older + // LSN might be faster, because that could allow skipping recent layers when + // finding the page. However, we have historically used 'last_record_lsn', so + // stick to that for now. + Ok(std::cmp::min(last_record_lsn, request_lsn)) } - - if lsn < **latest_gc_cutoff_lsn { - return Err(PageStreamError::BadRequest(format!( - "tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ).into())); - } - Ok(lsn) } #[instrument(skip_all, fields(shard_id))] @@ -931,9 +922,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let exists = timeline .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) @@ -959,9 +955,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let n_blocks = timeline .get_rel_size(req.rel, Version::Lsn(lsn), ctx) @@ -987,9 +988,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let total_blocks = timeline .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) @@ -1159,9 +1165,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let page = timeline .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) @@ -1187,9 +1198,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let kind = SlruKind::from_repr(req.kind) .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; @@ -1407,7 +1423,34 @@ where let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream ") { + if query_string.starts_with("pagestream_v2 ") { + let (_, params_raw) = query_string.split_at("pagestream_v2 ".len()); + let params = params_raw.split(' ').collect::>(); + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for pagestream command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + + 
self.check_permission(Some(tenant_id))?; + + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V2, + ctx, + ) + .await?; + } else if query_string.starts_with("pagestream ") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); if params.len() != 2 { @@ -1426,8 +1469,14 @@ where self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) - .await?; + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V1, + ctx, + ) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); diff --git a/trace/src/main.rs b/trace/src/main.rs index 4605c124e9..049f922b6f 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -7,7 +7,9 @@ use std::{ io::BufReader, }; -use pageserver_api::models::{PagestreamFeMessage, PagestreamGetPageRequest}; +use pageserver_api::models::{ + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, +}; use utils::id::{ConnectionId, TenantId, TimelineId}; use clap::{Parser, Subcommand}; @@ -56,7 +58,7 @@ fn analyze_trace(mut reader: R) { let mut prev: Option = None; // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { match msg { PagestreamFeMessage::Exists(_) => {} PagestreamFeMessage::Nblocks(_) => {} @@ -89,7 +91,7 @@ fn analyze_trace(mut reader: R) { } fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { println!("{msg:?}"); } } From a2a44ea213905ecd0f20b38f41a5725138214ee0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:45 +0300 Subject: [PATCH 082/157] Refactor how the request LSNs are tracked in compute (#7377) Instead of thinking in terms of 'latest' and 'lsn' of the request, each request has two LSNs: the request LSN and 'not_modified_since' LSN. The request is nominally made at the request LSN, that determines what page version we want to see. But as a hint, we also include 'not_modified_since'. It tells the pageserver that the page has not been modified since that LSN, which allows the pageserver to skip waiting for newer WAL to arrive, and could allow more optimizations in the future. Refactor the internal functions to calculate the request LSN to calculate both LSNs. Sending two LSNs to the pageserver requires using the new protocol version 2. The previous commit added the server support for it, but we still default to the old protocol for compatibility with old pageservers. The 'neon.protocol_version' GUC can be used to use the new protocol. The new protocol addresses one cause of issue #6211, although you can still get the same error if you have a standby that is lagging behind so that the page version it needs is genuinely GC'd away. 
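As an illustration of the new scheme (a simplified sketch only, not the actual code: the real selection logic is neon_get_request_lsn() in pgxn/neon/pagestore_smgr.c, and the function name and signature below are invented for the example), the compute picks the two LSNs roughly like this:

    // Lsn is stood in for by a plain u64 to keep the sketch self-contained.
    type Lsn = u64;

    /// Pick (request_lsn, not_modified_since) for a GetPage-style request.
    fn request_lsns(in_recovery: bool, last_written: Lsn, replay_lsn: Lsn, flush_lsn: Lsn) -> (Lsn, Lsn) {
        if in_recovery {
            // Standby: read the page as of the last replayed record, but tell the
            // pageserver that it has not changed since the (possibly much older)
            // last-written LSN, so the pageserver need not wait for newer WAL.
            (replay_lsn, last_written)
        } else {
            // Primary: read the latest version. The flush LSN stands in for the
            // insert LSN because it is cheaper to read; any page modified after the
            // last flush is still in shared buffers and would not be requested at all.
            (flush_lsn, last_written)
        }
    }

Using the flush LSN rather than the insert LSN on the primary is a deliberate trade-off: it avoids the overhead of looking up the insert position while remaining safe, since a page modified after the last WAL flush cannot have been evicted from the buffer cache yet.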
--- pgxn/neon/libpagestore.c | 26 +- pgxn/neon/pagestore_client.h | 34 +- pgxn/neon/pagestore_smgr.c | 438 +++++++++++------- pgxn/neon_test_utils/Makefile | 2 +- ...tils--1.0.sql => neon_test_utils--1.1.sql} | 4 +- pgxn/neon_test_utils/neon_test_utils.control | 2 +- pgxn/neon_test_utils/neontest.c | 29 +- test_runner/regress/test_read_validation.py | 40 +- test_runner/regress/test_vm_bits.py | 4 +- 9 files changed, 377 insertions(+), 202 deletions(-) rename pgxn/neon_test_utils/{neon_test_utils--1.0.sql => neon_test_utils--1.1.sql} (89%) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 2276b4e807..b7b1e7ccbf 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,6 +49,8 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; +int neon_protocol_version = 1; + static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; static int stripe_size; @@ -379,7 +381,17 @@ pageserver_connect(shardno_t shard_no, int elevel) pfree(msg); return false; } - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + switch (neon_protocol_version) + { + case 2: + query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + break; + case 1: + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + break; + default: + elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); + } ret = PQsendQuery(conn, query); pfree(query); if (ret != 1) @@ -440,7 +452,7 @@ pageserver_connect(shardno_t shard_no, int elevel) return false; } - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); page_servers[shard_no].conn = conn; page_servers[shard_no].wes = wes; @@ -844,6 +856,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.protocol_version", + "Version of compute<->page server protocol", + NULL, + &neon_protocol_version, + 1, /* default to old protocol for now */ + 1, /* min */ + 2, /* max */ + PGC_SU_BACKEND, + 0, /* no flags required */ + NULL, NULL, NULL); relsize_hash_init(); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 44ae766f76..7709ab9d42 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -69,18 +69,33 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; -/* - * supertype of all the Neon*Request structs below +/*-- + * supertype of all the Neon*Request structs below. * - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. + * All requests contain two LSNs: + * + * lsn: request page (or relation size, etc) at this LSN + * not_modified_since: Hint that the page hasn't been modified between + * this LSN and the request LSN (`lsn`). + * + * To request the latest version of a page, you can use MAX_LSN as the request + * LSN. + * + * If you don't know any better, you can always set 'not_modified_since' equal + * to 'lsn', but providing a lower value can speed up processing the request + * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it + * can skip traversing through recent layers which we know to not contain any + * versions for the requested page. 
+ * + * These structs describe the V2 of these requests. The old V1 protocol contained + * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is + * set to 1, we will convert these to the V1 requests before sending. */ typedef struct { NeonMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonRequest; typedef struct @@ -193,6 +208,7 @@ extern int readahead_buffer_size; extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; +extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); @@ -225,14 +241,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 57a16e00ca..44ecdbd9aa 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -168,8 +168,8 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr effective_request_lsn; - XLogRecPtr actual_request_lsn; + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -269,19 +269,19 @@ static PrefetchState *MyPState; ) \ ) -static XLogRecPtr prefetch_lsn = 0; - static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, - ForkNumber forknum, BlockNumber blkno); +static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since); +static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot); static bool compact_prefetch_buffers(void) @@ -338,8 +338,8 @@ 
compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->effective_request_lsn = source_slot->effective_request_lsn; - target_slot->actual_request_lsn = source_slot->actual_request_lsn; + target_slot->request_lsn = source_slot->request_lsn; + target_slot->not_modified_since = source_slot->not_modified_since; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -358,7 +358,8 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->effective_request_lsn = 0; + source_slot->request_lsn = InvalidXLogRecPtr; + source_slot->not_modified_since = InvalidXLogRecPtr; /* update bookkeeping */ n_moved++; @@ -683,56 +684,39 @@ prefetch_set_unused(uint64 ring_index) compact_prefetch_buffers(); } +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). + */ static void -prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since) { bool found; NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, - .req.latest = false, - .req.lsn = 0, + /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; - if (force_lsn && force_latest) + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + + if (force_request_lsn) { - request.req.lsn = *force_lsn; - request.req.latest = *force_latest; - slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; + request.req.lsn = *force_request_lsn; + request.req.not_modified_since = *force_not_modified_since; } else { - XLogRecPtr lsn = neon_get_request_lsn( - &request.req.latest, - BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum - ); - - /* - * Note: effective_request_lsn is potentially higher than the - * requested LSN, but still correct: - * - * We know there are no changes between the actual requested LSN and - * the value of effective_request_lsn: If there were, the page would - * have been in cache and evicted between those LSN values, which then - * would have had to result in a larger request LSN for this page. - * - * It is possible that a concurrent backend loads the page, modifies - * it and then evicts it again, but the LSN of that eviction cannot be - * smaller than the current WAL insert/redo pointer, which is already - * larger than this prefetch_lsn. So in any case, that would - * invalidate this cache. - * - * The best LSN to use for effective_request_lsn would be - * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. 
- */ - slot->actual_request_lsn = request.req.lsn = lsn; - prefetch_lsn = Max(prefetch_lsn, lsn); - slot->effective_request_lsn = prefetch_lsn; + neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum, + &request.req.lsn, + &request.req.not_modified_since); } + slot->request_lsn = request.req.lsn; + slot->not_modified_since = request.req.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -749,7 +733,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force /* update slot state */ slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -759,22 +742,25 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * * Register that we may want the contents of BufferTag in the near future. * - * If force_latest and force_lsn are not NULL, those values are sent to the - * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure - * to fill in these values manually. + * If force_request_lsn and force_not_modified_since are not NULL, those + * values are sent to the pageserver. If they are NULL, we utilize the + * lastWrittenLsn -infrastructure to fill them in. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. */ static uint64 -prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, + XLogRecPtr *force_not_modified_since) { uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; Retry: @@ -792,40 +778,19 @@ Retry: Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); /* - * If we want a specific lsn, we do not accept requests that were made - * with a potentially different LSN. + * If the caller specified a request LSN to use, only accept prefetch + * responses that satisfy that request. 
*/ - if (force_latest && force_lsn) + if (force_request_lsn) { - /* - * if we want the latest version, any effective_request_lsn < - * request lsn is OK - */ - if (*force_latest) + if (!neon_prefetch_response_usable(*force_request_lsn, + *force_not_modified_since, slot)) { - if (*force_lsn > slot->effective_request_lsn) - { - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - - } - - /* - * if we don't want the latest version, only accept requests with - * the exact same LSN - */ - else - { - if (*force_lsn != slot->effective_request_lsn) - { - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; } } @@ -921,7 +886,7 @@ Retry: slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_latest, force_lsn); + prefetch_do_request(slot, force_request_lsn, force_not_modified_since); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < MyPState->ring_unused); @@ -950,7 +915,7 @@ page_server_request(void const *req) BufferTag tag = {0}; shardno_t shard_no; - switch (((NeonRequest *) req)->tag) + switch (messageTag(req)) { case T_NeonExistsRequest: CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); @@ -966,11 +931,10 @@ page_server_request(void const *req) tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: - neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag); + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); - /* * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. @@ -997,8 +961,52 @@ nm_pack_request(NeonRequest *msg) StringInfoData s; initStringInfo(&s); - pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 2) + { + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + } + else + { + bool latest; + XLogRecPtr lsn; + + /* + * In primary, we always request the latest page version. + */ + if (!RecoveryInProgress()) + { + latest = true; + lsn = msg->not_modified_since; + } + else + { + /* + * In the protocol V1, we cannot represent that we want to read + * page at LSN X, and we know that it hasn't been modified since + * Y. We can either use 'not_modified_lsn' as the request LSN, and + * risk getting an error if that LSN is too old and has already + * fallen out of the pageserver's GC horizon, or we can send + * 'request_lsn', causing the pageserver to possibly wait for the + * recent WAL to arrive unnecessarily. Or something in between. We + * choose to use the old LSN and risk GC errors, because that's + * what we've done historically. 
+ */ + latest = false; + lsn = msg->not_modified_since; + } + + pq_sendbyte(&s, msg->tag); + pq_sendbyte(&s, latest); + pq_sendint64(&s, lsn); + } + + /* + * The rest of the request messages are the same between protocol V1 and + * V2 + */ switch (messageTag(msg)) { /* pagestore_client -> pagestore */ @@ -1006,8 +1014,6 @@ nm_pack_request(NeonRequest *msg) { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1019,8 +1025,6 @@ nm_pack_request(NeonRequest *msg) { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1032,8 +1036,6 @@ nm_pack_request(NeonRequest *msg) { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, msg_req->dbNode); break; @@ -1042,8 +1044,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1057,8 +1057,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendbyte(&s, msg_req->kind); pq_sendint32(&s, msg_req->segno); @@ -1209,7 +1207,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1222,7 +1220,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1236,7 +1234,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1247,7 +1245,7 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", 
LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1259,7 +1257,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1531,44 +1529,38 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static XLogRecPtr -neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since) { - XLogRecPtr lsn; + XLogRecPtr last_written_lsn; + + last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); + last_written_lsn = nm_adjust_lsn(last_written_lsn); + Assert(last_written_lsn != InvalidXLogRecPtr); if (RecoveryInProgress()) { - /* - * We don't know if WAL has been generated but not yet replayed, so - * we're conservative in our estimates about latest pages. - */ - *latest = false; + /* Request the page at the last replayed LSN. */ + *request_lsn = GetXLogReplayRecPtr(NULL); + *not_modified_since = last_written_lsn; + Assert(last_written_lsn <= *request_lsn); - /* - * Get the last written LSN of this page. - */ - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - lsn = nm_adjust_lsn(lsn); - - neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since)); } else { XLogRecPtr flushlsn; /* - * Use the latest LSN that was evicted from the buffer cache. Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. */ - *latest = true; - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - Assert(lsn != InvalidXLogRecPtr); neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = nm_adjust_lsn(lsn); + LSN_FORMAT_ARGS(last_written_lsn)); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1583,16 +1575,109 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block #else flushlsn = GetFlushRecPtr(); #endif - if (lsn > flushlsn) + if (last_written_lsn > flushlsn) { neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; } + + /* + * Request the latest version of the page. 
The most up-to-date request + * LSN we could use would be the current insert LSN, but to avoid the + * overhead of looking it up, use 'flushlsn' instead. This relies on + * the assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we wouldn't be + * requesting it. + */ + *request_lsn = flushlsn; + *not_modified_since = last_written_lsn; + } +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. + */ +static bool +neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsn >= not_modified_since); + Assert(slot->request_lsn >= slot->not_modified_since); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since)))); + return false; } - return lsn; + /*--- + * Each request to the pageserver carries two LSN values: + * `not_modified_since` and `request_lsn`. The (not_modified_since, + * request_lsn] range of each request is effectively a claim that the page + * has not been modified between those LSNs. If the range of the old + * request in the queue overlaps with the new request, we know that the + * page hasn't been modified in the union of the ranges. We can use the + * response to old request to satisfy the new request in that case. For + * example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. 
+ */ + + /* this follows from the checks above */ + Assert(request_lsn >= slot->not_modified_since); + + return not_modified_since <= slot->request_lsn; } /* @@ -1604,8 +1689,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -1660,12 +1745,13 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum}; @@ -2102,10 +2188,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer) #endif { NeonResponse *resp; @@ -2148,15 +2234,16 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry != NULL) { slot = entry->slot; - if (slot->effective_request_lsn >= request_lsn) + if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; } - else /* the current prefetch LSN is not large - * enough, so drop the prefetch */ + else { /* + * Cannot use this prefetch, discard it + * * We can't drop cache for not-yet-received requested items. 
It is * unlikely this happens, but it can happen if prefetch distance * is large enough and a backend didn't consume all prefetch @@ -2181,8 +2268,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_latest, - &request_lsn); + ring_index = prefetch_register_buffer(buftag, &request_lsn, + ¬_modified_since); slot = GetPrfSlot(ring_index); } else @@ -2246,8 +2333,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2272,8 +2359,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno, + &request_lsn, ¬_modified_since); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2442,8 +2530,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2470,12 +2558,13 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2523,16 +2612,17 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn; - bool latest; + XLogRecPtr request_lsn, + not_modified_since; NRelFileInfo dummy_node = {0}; - request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .dbNode = dbNode, }; @@ -2605,7 +2695,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * the most recently inserted WAL record's LSN. */ lsn = GetXLogInsertRecPtr(); - lsn = nm_adjust_lsn(lsn); /* @@ -2805,14 +2894,33 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn; - /* - * GetRedoStartLsn() returns LSN of basebackup. - * We need to download SLRU segments only once after node startup, - * then SLRUs are maintained locally. - */ - request_lsn = GetRedoStartLsn(); + XLogRecPtr request_lsn, + not_modified_since; + + if (RecoveryInProgress()) + { + request_lsn = GetXLogReplayRecPtr(NULL); + if (request_lsn == InvalidXLogRecPtr) + { + /* + * This happens in neon startup, we start up without replaying any + * records. 
+ */ + request_lsn = GetRedoStartLsn(); + } + } + else + request_lsn = GetXLogInsertRecPtr(); request_lsn = nm_adjust_lsn(request_lsn); + + /* + * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU + * segment has not changed since the basebackup, because in order to + * modify it, we would have had to download it already. And once + * downloaded, we never evict SLRU segments from local disk. + */ + not_modified_since = GetRedoStartLsn(); + SlruKind kind; if (STRPREFIX(path, "pg_xact")) @@ -2827,8 +2935,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, - .req.latest = false, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .kind = kind, .segno = segno @@ -2956,6 +3064,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, { BlockNumber relsize; + /* This is only used in WAL replay */ + Assert(RecoveryInProgress()); + /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) { @@ -2974,14 +3085,13 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ - NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { .req = (NeonRequest) { - .lsn = end_recptr, - .latest = false, .tag = T_NeonNblocksRequest, + .lsn = end_recptr, + .not_modified_since = end_recptr, }, .rinfo = rinfo, .forknum = forknum, diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 9c774ec185..1ee87357e5 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.0.sql +DATA = neon_test_utils--1.1.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.1.sql similarity index 89% rename from pgxn/neon_test_utils/neon_test_utils--1.0.sql rename to pgxn/neon_test_utils/neon_test_utils--1.1.sql index 23340e352e..534784f319 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.1.sql @@ -31,12 +31,12 @@ AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5219571f11..5f6d640835 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.0' +default_version = '1.1' 
module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 82ce5be9f6..677006923d 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); #endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -299,8 +299,11 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *forkname; uint32 blkno; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; + + if (PG_NARGS() != 5) + elog(ERROR, "unexpected number of arguments in SQL function signature"); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -309,6 +312,9 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); + request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -361,7 +367,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data); relation_close(rel, AccessShareLock); @@ -380,6 +386,9 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { char *raw_page_data; + if (PG_NARGS() != 7) + elog(ERROR, "unexpected number of arguments in SQL function signature"); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -403,18 +412,20 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) }; ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + not_modified_since = PG_ARGISNULL(6) ? 
request_lsn : PG_GETARG_LSN(6); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 868b80a561..2437c8f806 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -17,7 +17,14 @@ def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation", "empty") - endpoint = env.endpoints.create_start("test_read_validation") + endpoint = env.endpoints.create_start( + "test_read_validation", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. + config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -64,7 +71,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Cache is clear, reading stale page version") c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" @@ -77,7 +84,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Cache is clear, reading latest page version without cache") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -92,7 +99,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -102,7 +109,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -114,7 +121,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -133,7 +140,14 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = 
env.endpoints.create_start("test_read_validation_neg") + endpoint = env.endpoints.create_start( + "test_read_validation_neg", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. + config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -143,7 +157,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page of a missing relation") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except UndefinedTable as e: @@ -155,7 +169,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page at lsn 0") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except IoError as e: @@ -164,22 +178,22 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("Pass NULL as an input") expected = (None, None, None) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" # This check is currently failing, reading beyond EOF is returning a 0-page log.info("Read beyond EOF") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))" ) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index eff103ca09..06f2a8befd 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -173,7 +173,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # which changes the LSN on the page. 
cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() - cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )") + cur.execute( + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" + ) vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() assert vm_page_at_pageserver == vm_page_in_cache From 0397427dcf9de7d16ede744700b6d87c84ebfd46 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:48 +0300 Subject: [PATCH 083/157] Add test for SLRU download (#7377) Before PR #7377, on-demand SLRU download always used the basebackup's LSN in the SLRU download, but that LSN might get garbage-collected away in the pageserver. We should request the latest LSN, like with GetPage requests, with the LSN just indicating that we know that the page hasn't been changed since the LSN (since the basebackup in this case). Add test to demonstrate the problem. Without the fix, it fails with "tried to request a page version that was garbage collected" error from the pageserver. I wrote this test as part of earlier PR #6693, but that fell through the cracks and was never applied. PR #7377 superseded the fix from that older PR, but the test is still valid. --- .../regress/test_ondemand_slru_download.py | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 test_runner/regress/test_ondemand_slru_download.py diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py new file mode 100644 index 0000000000..0b36b32552 --- /dev/null +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -0,0 +1,131 @@ +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards +from fixtures.types import Lsn +from fixtures.utils import query_scalar + + +# +# Test on-demand download of the pg_xact SLRUs +# +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Restart postgres. After restart, the new instance will download the + # pg_xact segments lazily. 
+ endpoint.stop() + endpoint.start() + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Consume more WAL, so that the pageserver can compact and GC older data, + # including the LSN that we started the new endpoint at, + cur.execute("CREATE TABLE anothertable (i int, t text)") + cur.execute( + "INSERT INTO anothertable SELECT g, 'long string to consume some space' || g FROM generate_series(1, 10000) g" + ) + + # Run GC + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Test that this can still on-demand download the old pg_xact segments + cur.execute("select xmin, xmax, * from clogtest") + tup = cur.fetchall() + log.info(f"tuples = {tup}") + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + + # Open a new connection and insert another row, but leave + # the transaction open + pg_conn2 = endpoint.connect() + cur2 = pg_conn2.cursor() + cur2.execute("BEGIN") + cur2.execute("INSERT INTO clogtest VALUES (2)") + + # Another insert on the first connection, which is committed. + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Start standby at this point in time + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) + + # Commit transaction 2, after the standby was launched. + cur2.execute("COMMIT") + + # The replica should not see transaction 2 as committed. 
+ conn_replica = endpoint_at_lsn.connect() + cur_replica = conn_replica.cursor() + cur_replica.execute("SELECT * FROM clogtest") + assert cur_replica.fetchall() == [(1,), (3,)] From ca8fca0e9ff87b0dfdc776fd1806bd04238954a4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 25 Apr 2024 19:45:52 +0300 Subject: [PATCH 084/157] Add test to demonstrate the problem with protocol version 1 (#7377) --- test_runner/regress/test_hot_standby.py | 79 ++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index ac3315b86f..179cc273ec 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -3,7 +3,7 @@ import re import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -102,3 +102,80 @@ def test_2_replicas_start(neon_simple_env: NeonEnv): ) as secondary2: wait_replica_caughtup(primary, secondary1) wait_replica_caughtup(primary, secondary2) + + +# We had an issue that a standby server made GetPage requests with an +# old LSN, based on the last-written LSN cache, to avoid waits in the +# pageserver. However, requesting a page with a very old LSN, such +# that the GC horizon has already advanced past it, results in an +# error from the pageserver: +# "Bad request: tried to request a page version that was garbage collected" +# +# To avoid that, the compute<-> pageserver protocol was updated so +# that that the standby now sends two LSNs, the old last-written LSN +# and the current replay LSN. +# +# https://github.com/neondatabase/neon/issues/6211 +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): + tenant_conf = { + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + # Protocol version 2 was introduced to fix the issue + # that this test exercises. With protocol version 1 it + # fails. + config_lines=["neon.protocol_version=2"], + ) as secondary: + p_cur = primary.connect().cursor() + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)") + p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g") + + wait_replica_caughtup(primary, secondary) + + s_cur = secondary.connect().cursor() + + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 + + # Clear the cache in the standby, so that when we + # re-execute the query, it will make GetPage + # requests. This does not clear the last-written LSN cache + # so we still remember the LSNs of the pages. + s_cur.execute("SELECT clear_buffer_cache()") + + # Do other stuff on the primary, to advance the WAL + p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") + + # Run GC. 
The PITR interval is very small, so this advances the GC cutoff LSN + # very close to the primary's current insert LSN. + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Re-execute the query. The GetPage requests that this + # generates use old not_modified_since LSNs, older than + # the GC cutoff, but new request LSNs. (In protocol + # version 1 there was only one LSN, and this failed.) + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 From d63185fa6c05dc7ba5dba8d11bb84788c50e288f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Apr 2024 09:15:59 +0100 Subject: [PATCH 085/157] storage controller: log hygiene & better error type (#7508) These are testability/logging improvements spun off from #7475 - Don't log warnings for shutdown errors in compute hook - Revise logging around heartbeats and reconcile_all so that we aren't emitting such a large volume of INFO messages under normal quite conditions. - Clean up the `last_error` of TenantShard to hold a ReconcileError instead of a String, and use that properly typed error to suppress reconciler cancel errors during reconcile_all_now. This is important for tests that iteratively call that, as otherwise they would get 500 errors when some reconciler in flight was cancelled (perhaps due to a state change on the tenant shard starting a new reconciler). --- storage_controller/src/heartbeater.rs | 13 +++++++++ storage_controller/src/reconciler.rs | 5 +++- storage_controller/src/service.rs | 35 ++++++++++++++++++---- storage_controller/src/tenant_shard.rs | 40 +++++++++++++++++++------- 4 files changed, 76 insertions(+), 17 deletions(-) diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 7669680eb6..1ef97e78eb 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -184,6 +184,19 @@ impl HeartbeaterTask { } } } + tracing::info!( + "Heartbeat round complete for {} nodes, {} offline", + new_state.len(), + new_state + .values() + .filter(|s| match s { + PageserverState::Available { .. } => { + false + } + PageserverState::Offline => true, + }) + .count() + ); let mut deltas = Vec::new(); let now = Instant::now(); diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 28801ede6e..f38905b424 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -767,7 +767,10 @@ impl Reconciler { // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + if !matches!(e, NotifyError::ShuttingDown) { + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + } + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. 
self.compute_notify_failure = true; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2e6f3750e7..952664e339 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -824,8 +824,7 @@ impl Service { // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); + tenant.set_last_error(result.sequence, e); for (node_id, o) in result.observed.locations { tenant.observed.locations.insert(node_id, o); @@ -2805,7 +2804,14 @@ impl Service { tenant_shard_id: shard.tenant_shard_id, node_attached: *shard.intent.get_attached(), node_secondary: shard.intent.get_secondary().to_vec(), - last_error: shard.last_error.lock().unwrap().clone(), + last_error: shard + .last_error + .lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()) + .clone(), is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), @@ -4031,7 +4037,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::info!("Node {} no change during config", node_id); + tracing::debug!("Node {} no change during config", node_id); } } @@ -4351,7 +4357,26 @@ impl Service { }; let waiter_count = waiters.len(); - self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + match self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + Ok(()) => {} + Err(ReconcileWaitError::Failed(_, reconcile_error)) + if matches!(*reconcile_error, ReconcileError::Cancel) => + { + // Ignore reconciler cancel errors: this reconciler might have shut down + // because some other change superceded it. We will return a nonzero number, + // so the caller knows they might have to call again to quiesce the system. + } + Err(e) => { + return Err(e); + } + }; + + tracing::info!( + "{} reconciles in reconcile_all, {} waiters", + reconciles_spawned, + waiter_count + ); + Ok(waiter_count) } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index d69260b9e7..7b11dfe64d 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -38,12 +38,18 @@ use crate::{ }; /// Serialization helper -fn read_mutex_content(v: &std::sync::Mutex, serializer: S) -> Result +fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result where S: serde::ser::Serializer, - T: Clone + std::fmt::Display, + T: std::fmt::Display, { - serializer.collect_str(&v.lock().unwrap()) + serializer.collect_str( + &v.lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()), + ) } /// In-memory state for a particular tenant shard. @@ -111,11 +117,15 @@ pub(crate) struct TenantShard { #[serde(skip)] pub(crate) error_waiter: std::sync::Arc>, - /// The most recent error from a reconcile on this tenant + /// The most recent error from a reconcile on this tenant. This is a nested Arc + /// because: + /// - ReconcileWaiters need to Arc-clone the overall object to read it later + /// - ReconcileWaitError needs to use an `Arc` because we can construct + /// many waiters for one shard, and the underlying error types are not Clone. /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? 
- #[serde(serialize_with = "read_mutex_content")] - pub(crate) last_error: std::sync::Arc>, + #[serde(serialize_with = "read_last_error")] + pub(crate) last_error: std::sync::Arc>>>, /// If we have a pending compute notification that for some reason we weren't able to send, /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes @@ -293,18 +303,18 @@ pub(crate) struct ReconcilerWaiter { seq_wait: std::sync::Arc>, error_seq_wait: std::sync::Arc>, - error: std::sync::Arc>, + error: std::sync::Arc>>>, seq: Sequence, } #[derive(thiserror::Error, Debug)] -pub enum ReconcileWaitError { +pub(crate) enum ReconcileWaitError { #[error("Timeout waiting for shard {0}")] Timeout(TenantShardId), #[error("shutting down")] Shutdown, #[error("Reconcile error on shard {0}: {1}")] - Failed(TenantShardId, String), + Failed(TenantShardId, Arc), } #[derive(Eq, PartialEq, Debug)] @@ -342,7 +352,8 @@ impl ReconcilerWaiter { SeqWaitError::Timeout => unreachable!() })?; - return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone())) + return Err(ReconcileWaitError::Failed(self.tenant_shard_id, + self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone())) } } @@ -873,7 +884,7 @@ impl TenantShard { active_nodes_dirty || dirty_observed || self.pending_compute_notification; if !do_reconcile { - tracing::info!("Not dirty, no reconciliation needed."); + tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1151,6 +1162,13 @@ impl TenantShard { &self.scheduling_policy } + pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { + // Ordering: always set last_error before advancing sequence, so that sequence + // waiters are guaranteed to see a Some value when they see an error. + *(self.last_error.lock().unwrap()) = Some(Arc::new(error)); + self.error_waiter.advance(sequence); + } + pub(crate) fn from_persistent( tsp: TenantShardPersistence, intent: IntentState, From 70f4a16a05a5512c250102600f7900169b15c56d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Apr 2024 10:30:20 +0200 Subject: [PATCH 086/157] refactor(owned_buffers_io::BufferedWriter): be generic over the type of buffer (#7482) --- .../tenant/remote_timeline_client/download.rs | 9 +- .../virtual_file/owned_buffers_io/write.rs | 147 +++++++++++++----- 2 files changed, 110 insertions(+), 46 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 84692aa577..7bf2d2de10 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -7,6 +7,7 @@ use std::collections::HashSet; use std::future::Future; use anyhow::{anyhow, Context}; +use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use tokio::fs::{self, File, OpenOptions}; @@ -194,10 +195,10 @@ async fn download_object<'a>( // There's chunks_vectored() on the stream. 
let (bytes_amount, destination_file) = async { let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::< - { super::BUFFER_SIZE }, - _, - >::new(size_tracking); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + size_tracking, + BytesMut::with_capacity(super::BUFFER_SIZE), + ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f1812d9b51..6b3a02c71a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -10,14 +10,14 @@ pub trait OwnedAsyncWriter { ) -> std::io::Result<(usize, B::Buf)>; } -/// A wrapper aorund an [`OwnedAsyncWriter`] that batches smaller writers -/// into `BUFFER_SIZE`-sized writes. +/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch +/// small writes into larger writes of size [`Buffer::cap`]. /// /// # Passthrough Of Large Writers /// -/// Buffered writes larger than the `BUFFER_SIZE` cause the internal -/// buffer to be flushed, even if it is not full yet. Then, the large -/// buffered write is passed through to the unerlying [`OwnedAsyncWriter`]. +/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] +/// cause the internal buffer to be flushed prematurely so that the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. /// /// This pass-through is generally beneficial for throughput, but if /// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, @@ -25,24 +25,25 @@ pub trait OwnedAsyncWriter { /// /// In such cases, a different implementation that always buffers in memory /// may be preferable. -pub struct BufferedWriter { +pub struct BufferedWriter { writer: W, - // invariant: always remains Some(buf) - // with buf.capacity() == BUFFER_SIZE except - // - while IO is ongoing => goes back to Some() once the IO completed successfully - // - after an IO error => stays `None` forever - // In these exceptional cases, it's `None`. - buf: Option, + /// invariant: always remains Some(buf) except + /// - while IO is ongoing => goes back to Some() once the IO completed successfully + /// - after an IO error => stays `None` forever + /// In these exceptional cases, it's `None`. 
+ buf: Option, } -impl BufferedWriter +impl BufferedWriter where + B: Buffer + Send, + Buf: IoBuf + Send, W: OwnedAsyncWriter, { - pub fn new(writer: W) -> Self { + pub fn new(writer: W, buf: B) -> Self { Self { writer, - buf: Some(BytesMut::with_capacity(BUFFER_SIZE)), + buf: Some(buf), } } @@ -53,61 +54,121 @@ where Ok(writer) } - pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<()> + #[inline(always)] + fn buf(&self) -> &B { + self.buf + .as_ref() + .expect("must not use after we returned an error") + } + + pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<(usize, S)> where - B: IoBuf + Send, + S: IoBuf + Send, { + let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk - if chunk.len() >= BUFFER_SIZE { + if chunk.len() >= self.buf().cap() { self.flush().await?; // do a big write, bypassing `buf` assert_eq!( self.buf .as_ref() .expect("must not use after an error") - .len(), + .pending(), 0 ); - let chunk_len = chunk.len(); let (nwritten, chunk) = self.writer.write_all(chunk).await?; assert_eq!(nwritten, chunk_len); - drop(chunk); - return Ok(()); + return Ok((nwritten, chunk)); } // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < BUFFER_SIZE); - let mut chunk = &chunk[..]; - while !chunk.is_empty() { + assert!(chunk.len() < self.buf().cap()); + let mut slice = &chunk[..]; + while !slice.is_empty() { let buf = self.buf.as_mut().expect("must not use after an error"); - let need = BUFFER_SIZE - buf.len(); - let have = chunk.len(); + let need = buf.cap() - buf.pending(); + let have = slice.len(); let n = std::cmp::min(need, have); - buf.extend_from_slice(&chunk[..n]); - chunk = &chunk[n..]; - if buf.len() >= BUFFER_SIZE { - assert_eq!(buf.len(), BUFFER_SIZE); + buf.extend_from_slice(&slice[..n]); + slice = &slice[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); self.flush().await?; } } - assert!(chunk.is_empty(), "by now we should have drained the chunk"); - Ok(()) + assert!(slice.is_empty(), "by now we should have drained the chunk"); + Ok((chunk_len, chunk.into_inner())) } async fn flush(&mut self) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); - if buf.is_empty() { + let buf_len = buf.pending(); + if buf_len == 0 { self.buf = Some(buf); - return std::io::Result::Ok(()); + return Ok(()); } - let buf_len = buf.len(); - let (nwritten, mut buf) = self.writer.write_all(buf).await?; + let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?; assert_eq!(nwritten, buf_len); - buf.clear(); - self.buf = Some(buf); + self.buf = Some(Buffer::reuse_after_flush(io_buf)); Ok(()) } } +/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones. +pub trait Buffer { + type IoBuf: IoBuf; + + /// Capacity of the buffer. Must not change over the lifetime `self`.` + fn cap(&self) -> usize; + + /// Add data to the buffer. + /// Panics if there is not enough room to accomodate `other`'s content, i.e., + /// panics if `other.len() > self.cap() - self.pending()`. + fn extend_from_slice(&mut self, other: &[u8]); + + /// Number of bytes in the buffer. + fn pending(&self) -> usize; + + /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// so we can use [`tokio_epoll_uring`] to write it to disk. + fn flush(self) -> Slice; + + /// After the write to disk is done and we have gotten back the slice, + /// [`BufferedWriter`] uses this method to re-use the io buffer. 
+ fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; +} + +impl Buffer for BytesMut { + type IoBuf = BytesMut; + + #[inline(always)] + fn cap(&self) -> usize { + self.capacity() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + BytesMut::extend_from_slice(self, other) + } + + #[inline(always)] + fn pending(&self) -> usize { + self.len() + } + + fn flush(self) -> Slice { + if self.is_empty() { + return self.slice_full(); + } + let len = self.len(); + self.slice(0..len) + } + + fn reuse_after_flush(mut iobuf: BytesMut) -> Self { + iobuf.clear(); + iobuf + } +} + impl OwnedAsyncWriter for Vec { async fn write_all, Buf: IoBuf + Send>( &mut self, @@ -125,6 +186,8 @@ impl OwnedAsyncWriter for Vec { #[cfg(test)] mod tests { + use bytes::BytesMut; + use super::*; #[derive(Default)] @@ -158,7 +221,7 @@ mod tests { #[tokio::test] async fn test_buffered_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"b"); write!(writer, b"c"); @@ -175,7 +238,7 @@ mod tests { #[tokio::test] async fn test_passthrough_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"abc"); write!(writer, b"de"); write!(writer, b""); @@ -191,7 +254,7 @@ mod tests { #[tokio::test] async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"bc"); write!(writer, b"d"); From bf369f4268f839b5228dd1d65d822280d50401c8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Apr 2024 11:19:41 +0200 Subject: [PATCH 087/157] refactor(owned_buffer_io::util::size_tracking_writer): make generic over underlying writer (#7483) part of https://github.com/neondatabase/neon/issues/7124 --- .../tenant/remote_timeline_client/download.rs | 1 + pageserver/src/virtual_file.rs | 12 +++++++++++ .../util/size_tracking_writer.rs | 21 +++++++++++-------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 7bf2d2de10..3744eecab5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -183,6 +183,7 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + use bytes::BytesMut; async { let destination_file = VirtualFile::create(dst_path) .await diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 0cf6a0019b..1d43a94568 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -32,6 +32,7 @@ pub use io_engine::feature_test as io_engine_feature_test; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; +use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; @@ -1083,6 +1084,17 @@ impl 
Drop for VirtualFile { } } +impl OwnedAsyncWriter for VirtualFile { + #[inline(always)] + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let (buf, res) = VirtualFile::write_all(self, buf).await; + res.map(move |v| (v, buf)) + } +} + impl OpenFiles { fn new(num_slots: usize) -> OpenFiles { let mut slots = Box::new(Vec::with_capacity(num_slots)); diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 7505b7487e..edb11c5f4c 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,33 +1,36 @@ -use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile}; +use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter; use tokio_epoll_uring::{BoundedBuf, IoBuf}; -pub struct Writer { - dst: VirtualFile, +pub struct Writer { + dst: W, bytes_amount: u64, } -impl Writer { - pub fn new(dst: VirtualFile) -> Self { +impl Writer { + pub fn new(dst: W) -> Self { Self { dst, bytes_amount: 0, } } + /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. - pub fn into_inner(self) -> (u64, VirtualFile) { + pub fn into_inner(self) -> (u64, W) { (self.bytes_amount, self.dst) } } -impl OwnedAsyncWriter for Writer { +impl OwnedAsyncWriter for Writer +where + W: OwnedAsyncWriter, +{ #[inline(always)] async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, ) -> std::io::Result<(usize, B::Buf)> { - let (buf, res) = self.dst.write_all(buf).await; - let nwritten = res?; + let (nwritten, buf) = self.dst.write_all(buf).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) } From dbb0c967d5fb5104847fb71e8d783ebeae3e7ff2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Apr 2024 13:01:26 +0200 Subject: [PATCH 088/157] refactor(ephemeral_file): reuse owned_buffers_io::BufferedWriter (#7484) part of https://github.com/neondatabase/neon/issues/7124 Changes ------- This PR replaces the `EphemeralFile::write_blob`-specifc `struct Writer` with re-use of `owned_buffers_io::write::BufferedWriter`. 
Further, it restructures the code to cleanly separate * the high-level aspect of EphemeralFile's write_blob / read_blk API * the page-caching aspect * the aspect of IO * performing buffered write IO to an underlying VirtualFile * serving reads from either the VirtualFile or the buffer if it hasn't been flushed yet * the annoying "feature" that reads past the end of the written range are allowed and expected to return zeroed memory, as long as one remains within one PAGE_SZ --- pageserver/src/task_mgr.rs | 2 + pageserver/src/tenant/ephemeral_file.rs | 223 ++---------------- .../src/tenant/ephemeral_file/page_caching.rs | 218 +++++++++++++++++ .../ephemeral_file/zero_padded_read_write.rs | 125 ++++++++++ .../zero_padded_read_write/zero_padded.rs | 108 +++++++++ .../tenant/remote_timeline_client/download.rs | 1 - .../tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/virtual_file.rs | 1 - .../util/size_tracking_writer.rs | 8 + .../virtual_file/owned_buffers_io/write.rs | 58 +++++ 10 files changed, 538 insertions(+), 208 deletions(-) create mode 100644 pageserver/src/tenant/ephemeral_file/page_caching.rs create mode 100644 pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs create mode 100644 pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 9a1e354ecf..b76105399b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -361,6 +361,8 @@ pub enum TaskKind { DebugTool, + EphemeralFilePreWarmPageCache, + #[cfg(test)] UnitTest, } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index b27230db03..96efd13c1b 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -3,36 +3,26 @@ use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; +use crate::page_cache; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::{self, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use std::cmp::min; -use std::io::{self, ErrorKind}; -use std::ops::DerefMut; +use std::io; use std::sync::atomic::AtomicU64; -use tracing::*; use utils::id::TimelineId; pub struct EphemeralFile { - page_cache_file_id: page_cache::FileId, - _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - file: VirtualFile, - len: u64, - /// An ephemeral file is append-only. - /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. - /// The other pages, which can no longer be modified, are accessed through the page cache. - /// - /// None <=> IO is ongoing. - /// Size is fixed to PAGE_SZ at creation time and must not be changed. 
- mutable_tail: Option, + + rw: page_caching::RW, } +mod page_caching; +mod zero_padded_read_write; + impl EphemeralFile { pub async fn create( conf: &PageServerConf, @@ -59,21 +49,18 @@ impl EphemeralFile { .await?; Ok(EphemeralFile { - page_cache_file_id: page_cache::next_file_id(), _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - file, - len: 0, - mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)), + rw: page_caching::RW::new(file), }) } pub(crate) fn len(&self) -> u64 { - self.len + self.rw.bytes_written() } - pub(crate) fn id(&self) -> page_cache::FileId { - self.page_cache_file_id + pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { + self.rw.page_cache_file_id() } pub(crate) async fn read_blk( @@ -81,182 +68,30 @@ impl EphemeralFile { blknum: u32, ctx: &RequestContext, ) -> Result { - let flushed_blknums = 0..self.len / PAGE_SZ as u64; - if flushed_blknums.contains(&(blknum as u64)) { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, self.file.path, e, - ), - ) - })? { - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(write_guard) => { - let write_guard = self - .file - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - }; - } else { - debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail( - self.mutable_tail - .as_deref() - .expect("we're not doing IO, it must be Some()") - .try_into() - .expect("we ensure that it's always PAGE_SZ"), - )) - } + self.rw.read_blk(blknum, ctx).await } pub(crate) async fn write_blob( &mut self, srcbuf: &[u8], - ctx: &RequestContext, + _ctx: &RequestContext, ) -> Result { - struct Writer<'a> { - ephemeral_file: &'a mut EphemeralFile, - /// The block to which the next [`push_bytes`] will write. - blknum: u32, - /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write. - off: usize, - } - impl<'a> Writer<'a> { - fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result> { - Ok(Writer { - blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32, - off: (ephemeral_file.len % PAGE_SZ as u64) as usize, - ephemeral_file, - }) - } - #[inline(always)] - async fn push_bytes( - &mut self, - src: &[u8], - ctx: &RequestContext, - ) -> Result<(), io::Error> { - let mut src_remaining = src; - while !src_remaining.is_empty() { - let dst_remaining = &mut self - .ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not yet ongoing")[self.off..]; - let n = min(dst_remaining.len(), src_remaining.len()); - dst_remaining[..n].copy_from_slice(&src_remaining[..n]); - self.off += n; - src_remaining = &src_remaining[n..]; - if self.off == PAGE_SZ { - let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail) - .expect("IO is not yet ongoing"); - let (mutable_tail, res) = self - .ephemeral_file - .file - .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64) - .await; - // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail. - // I.e., the IO isn't retryable if we panic. 
- self.ephemeral_file.mutable_tail = Some(mutable_tail); - match res { - Ok(_) => { - // Pre-warm the page cache with what we just wrote. - // This isn't necessary for coherency/correctness, but it's how we've always done it. - let cache = page_cache::get(); - match cache - .read_immutable_buf( - self.ephemeral_file.page_cache_file_id, - self.blknum, - ctx, - ) - .await - { - Ok(page_cache::ReadBufResult::Found(_guard)) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. - unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum); - } - Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - buf.copy_from_slice( - self.ephemeral_file - .mutable_tail - .as_deref() - .expect("IO is not ongoing"), - ); - let _ = write_guard.mark_valid(); - // pre-warm successful - } - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - } - // Zero the buffer for re-use. - // Zeroing is critical for correcntess because the write_blob code below - // and similarly read_blk expect zeroed pages. - self.ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not ongoing") - .fill(0); - // This block is done, move to next one. - self.blknum += 1; - self.off = 0; - } - Err(e) => { - return Err(std::io::Error::new( - ErrorKind::Other, - // order error before path because path is long and error is short - format!( - "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}", - self.blknum, - e, - self.ephemeral_file.file.path, - ), - )); - } - } - } - } - Ok(()) - } - } - - let pos = self.len; - let mut writer = Writer::new(self)?; + let pos = self.rw.bytes_written(); // Write the length field if srcbuf.len() < 0x80 { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - writer.push_bytes(&len_buf, ctx).await?; + + self.rw.write_all_borrowed(&len_buf).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - writer.push_bytes(&len_buf, ctx).await?; + self.rw.write_all_borrowed(&len_buf).await?; } // Write the payload - writer.push_bytes(srcbuf, ctx).await?; - - if srcbuf.len() < 0x80 { - self.len += 1; - } else { - self.len += 4; - } - self.len += srcbuf.len() as u64; + self.rw.write_all_borrowed(srcbuf).await?; Ok(pos) } @@ -271,28 +106,6 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl Drop for EphemeralFile { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - let res = std::fs::remove_file(&self.file.path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. 
- // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.file.path, e - ); - } - } - } -} - impl BlockReader for EphemeralFile { fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs new file mode 100644 index 0000000000..934400e5be --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -0,0 +1,218 @@ +//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the +//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. + +use crate::context::RequestContext; +use crate::page_cache::{self, PAGE_SZ}; +use crate::tenant::block_io::BlockLease; +use crate::virtual_file::VirtualFile; + +use once_cell::sync::Lazy; +use std::io::{self, ErrorKind}; +use tokio_epoll_uring::BoundedBuf; +use tracing::*; + +use super::zero_padded_read_write; + +/// See module-level comment. +pub struct RW { + page_cache_file_id: page_cache::FileId, + rw: super::zero_padded_read_write::RW, +} + +impl RW { + pub fn new(file: VirtualFile) -> Self { + let page_cache_file_id = page_cache::next_file_id(); + Self { + page_cache_file_id, + rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( + page_cache_file_id, + file, + )), + } + } + + pub fn page_cache_file_id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + + pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result { + // It doesn't make sense to proactively fill the page cache on the Pageserver write path + // because Compute is unlikely to access recently written data. + self.rw.write_all_borrowed(srcbuf).await + } + + pub(crate) fn bytes_written(&self) -> u64 { + self.rw.bytes_written() + } + + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { + match self.rw.read_blk(blknum).await? { + zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { + let cache = page_cache::get(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, ctx) + .await + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + // order path before error because error is anyhow::Error => might have many contexts + format!( + "ephemeral file: read immutable page #{}: {}: {:#}", + blknum, + self.rw.as_writer().file.path, + e, + ), + ) + })? { + page_cache::ReadBufResult::Found(guard) => { + return Ok(BlockLease::PageReadGuard(guard)) + } + page_cache::ReadBufResult::NotFound(write_guard) => { + let write_guard = writer + .file + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) + .await?; + let read_guard = write_guard.mark_valid(); + return Ok(BlockLease::PageReadGuard(read_guard)); + } + } + } + zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { + Ok(BlockLease::EphemeralFileMutableTail(buffer)) + } + } + } +} + +impl Drop for RW { + fn drop(&mut self) { + // There might still be pages in the [`crate::page_cache`] for this file. + // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. 
+ + // unlink the file + let res = std::fs::remove_file(&self.rw.as_writer().file.path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. + // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!( + "could not remove ephemeral file '{}': {}", + self.rw.as_writer().file.path, + e + ); + } + } + } +} + +struct PreWarmingWriter { + nwritten_blocks: u32, + page_cache_file_id: page_cache::FileId, + file: VirtualFile, +} + +impl PreWarmingWriter { + fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + Self { + nwritten_blocks: 0, + page_cache_file_id, + file, + } + } +} + +impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { + async fn write_all< + B: tokio_epoll_uring::BoundedBuf, + Buf: tokio_epoll_uring::IoBuf + Send, + >( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let buf = buf.slice(..); + let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done + let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { + Some(buf.to_vec()) + } else { + None + }; + let buflen = buf.len(); + assert_eq!( + buflen % PAGE_SZ, + 0, + "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used" + ); + + // Do the IO. + let iobuf = match self.file.write_all(buf).await { + (iobuf, Ok(nwritten)) => { + assert_eq!(nwritten, buflen); + iobuf + } + (_, Err(e)) => { + return Err(std::io::Error::new( + ErrorKind::Other, + // order error before path because path is long and error is short + format!( + "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}", + self.nwritten_blocks, buflen, e, self.file.path, + ), + )); + } + }; + + // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) + let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); + if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { + assert_eq!(&check_bounds_stuff_works, &*buf); + } + + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + let nblocks = buflen / PAGE_SZ; + let nblocks32 = u32::try_from(nblocks).unwrap(); + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here + } + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. 
+ unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } + } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); + Ok((buflen, buf.into_inner())) + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs new file mode 100644 index 0000000000..34944b1072 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -0,0 +1,125 @@ +//! The heart of how [`super::EphemeralFile`] does its reads and writes. +//! +//! # Writes +//! +//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. +//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. +//! +//! # Reads +//! +//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. +//! +//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer +//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] +//! if the read is for the prefix that has already been flushed. +//! +//! # Current Usage +//! +//! The current user of this module is [`super::page_caching::RW`]. + +mod zero_padded; + +use crate::{ + page_cache::PAGE_SZ, + virtual_file::owned_buffers_io::{ + self, + write::{Buffer, OwnedAsyncWriter}, + }, +}; + +const TAIL_SZ: usize = PAGE_SZ; + +/// See module-level comment. +pub struct RW { + buffered_writer: owned_buffers_io::write::BufferedWriter< + zero_padded::Buffer, + owned_buffers_io::util::size_tracking_writer::Writer, + >, +} + +pub enum ReadResult<'a, W> { + NeedsReadFromWriter { writer: &'a W }, + ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, +} + +impl RW +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + let bytes_flushed_tracker = + owned_buffers_io::util::size_tracking_writer::Writer::new(writer); + let buffered_writer = owned_buffers_io::write::BufferedWriter::new( + bytes_flushed_tracker, + zero_padded::Buffer::default(), + ); + Self { buffered_writer } + } + + pub(crate) fn as_writer(&self) -> &W { + self.buffered_writer.as_inner().as_inner() + } + + pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf).await + } + + pub fn bytes_written(&self) -> u64 { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + flushed_offset + u64::try_from(buffer.pending()).unwrap() + } + + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); + let read_offset = (blknum as u64) * (PAGE_SZ as u64); + + // The trailing page ("block") might only be partially filled, + // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. + // Moreover, it has to be zero-padded, because when we still had + // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. 
+ // DeltaLayer probably has the same issue, not sure why it needs no special treatment. + // => check here that the read doesn't go beyond this potentially trailing + // => the zero-padding is done in the `else` branch below + let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { + buffered_offset / (PAGE_SZ as u64) + } else { + (buffered_offset / (PAGE_SZ as u64)) + 1 + }; + if (blknum as u64) >= blocks_written { + return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"))); + } + + // assertions for the `if-else` below + assert_eq!( + flushed_offset % (TAIL_SZ as u64), 0, + "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" + ); + assert_eq!( + flushed_offset % (PAGE_SZ as u64), + 0, + "the logic below can't handle if the page is spread across the flushed part and the buffer" + ); + + if read_offset < flushed_offset { + assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); + Ok(ReadResult::NeedsReadFromWriter { + writer: self.as_writer(), + }) + } else { + let read_offset_in_buffer = read_offset + .checked_sub(flushed_offset) + .expect("would have taken `if` branch instead of this one"); + let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); + let zero_padded_slice = buffer.as_zero_padded_slice(); + let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; + Ok(ReadResult::ServedFromZeroPaddedMutableTail { + buffer: page + .try_into() + .expect("the slice above got it as page-size slice"), + }) + } + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs new file mode 100644 index 0000000000..f90291bbf8 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -0,0 +1,108 @@ +//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose +//! unwritten range is guaranteed to be zero-initialized. +//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] +//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. + +use std::mem::MaybeUninit; + +/// See module-level comment. 
+pub struct Buffer { + allocation: Box<[u8; N]>, + written: usize, +} + +impl Default for Buffer { + fn default() -> Self { + Self { + allocation: Box::new( + // SAFETY: zeroed memory is a valid [u8; N] + unsafe { MaybeUninit::zeroed().assume_init() }, + ), + written: 0, + } + } +} + +impl Buffer { + #[inline(always)] + fn invariants(&self) { + // don't check by default, unoptimized is too expensive even for debug mode + if false { + debug_assert!(self.written <= N, "{}", self.written); + debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); + } + } + + pub fn as_zero_padded_slice(&self) -> &[u8; N] { + &self.allocation + } +} + +impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { + type IoBuf = Self; + + fn cap(&self) -> usize { + self.allocation.len() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + self.invariants(); + let remaining = self.allocation.len() - self.written; + if other.len() > remaining { + panic!("calling extend_from_slice() with insufficient remaining capacity"); + } + self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); + self.written += other.len(); + self.invariants(); + } + + fn pending(&self) -> usize { + self.written + } + + fn flush(self) -> tokio_epoll_uring::Slice { + self.invariants(); + let written = self.written; + tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + } + + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let Self { + mut allocation, + written, + } = iobuf; + allocation[0..written].fill(0); + let new = Self { + allocation, + written: 0, + }; + new.invariants(); + new + } +} + +/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a +/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. +/// +/// Remember that bytes_init is generally _not_ a tracker of the amount +/// of valid data in the io buffer; we use `Slice` for that. +/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. +/// +/// SAFETY: +/// +/// The [`Self::allocation`] is stable becauses boxes are stable. +/// The memory is zero-initialized, so, bytes_init is always N. +unsafe impl tokio_epoll_uring::IoBuf for Buffer { + fn stable_ptr(&self) -> *const u8 { + self.allocation.as_ptr() + } + + fn bytes_init(&self) -> usize { + // Yes, N, not self.written; Read the full comment of this impl block! 
+ N + } + + fn bytes_total(&self) -> usize { + N + } +} diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 3744eecab5..1852e4b4ff 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -7,7 +7,6 @@ use std::collections::HashSet; use std::future::Future; use anyhow::{anyhow, Context}; -use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use tokio::fs::{self, File, OpenOptions}; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5939b969d6..8ec4d61434 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -482,7 +482,7 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; - let key = InMemoryLayerFileId(file.id()); + let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { file_id: key, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 1d43a94568..6127b35079 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -37,7 +37,6 @@ pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; -#[cfg_attr(not(target_os = "linux"), allow(dead_code))] pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. //! diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index edb11c5f4c..107ada4c13 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -14,6 +14,14 @@ impl Writer { } } + pub fn bytes_written(&self) -> u64 { + self.bytes_amount + } + + pub fn as_inner(&self) -> &W { + &self.dst + } + /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. pub fn into_inner(self) -> (u64, W) { diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 6b3a02c71a..d419f02f2d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -47,6 +47,15 @@ where } } + pub fn as_inner(&self) -> &W { + &self.writer + } + + /// Panics if used after any of the write paths returned an error + pub fn inspect_buffer(&self) -> &B { + self.buf() + } + pub async fn flush_and_into_inner(mut self) -> std::io::Result { self.flush().await?; let Self { buf, writer } = self; @@ -100,6 +109,28 @@ where Ok((chunk_len, chunk.into_inner())) } + /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. + /// + /// It is less performant because we always have to copy the borrowed data into the internal buffer + /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant + /// for large writes. 
+ pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result { + let chunk_len = chunk.len(); + while !chunk.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = chunk.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&chunk[..n]); + chunk = &chunk[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush().await?; + } + } + Ok(chunk_len) + } + async fn flush(&mut self) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); let buf_len = buf.pending(); @@ -266,4 +297,31 @@ mod tests { ); Ok(()) } + + #[tokio::test] + async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + + writer.write_buffered_borrowed(b"abc").await?; + writer.write_buffered_borrowed(b"d").await?; + writer.write_buffered_borrowed(b"e").await?; + writer.write_buffered_borrowed(b"fg").await?; + writer.write_buffered_borrowed(b"hi").await?; + writer.write_buffered_borrowed(b"j").await?; + writer.write_buffered_borrowed(b"klmno").await?; + + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + { + let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; + expect + } + .iter() + .map(|v| v[..].to_vec()) + .collect::>() + ); + Ok(()) + } } From f1de18f1c9057510fb34d8241011a35d0f249d50 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 26 Apr 2024 12:15:05 +0100 Subject: [PATCH 089/157] Remove unused import (#7519) Linter error from a merge collision From ed577727936b18479a6d04c2449bb77eb8245e19 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Apr 2024 13:34:28 +0200 Subject: [PATCH 090/157] perf!: use larger buffers for blob_io and ephemeral_file (#7485) part of https://github.com/neondatabase/neon/issues/7124 # Problem (Re-stating the problem from #7124 for posterity) The `test_bulk_ingest` benchmark shows about 2x lower throughput with `tokio-epoll-uring` compared to `std-fs`. That's why we temporarily disabled it in #7238. The reason for this regression is that the benchmark runs on a system without memory pressure and thus std-fs writes don't block on disk IO but only copy the data into the kernel page cache. `tokio-epoll-uring` cannot beat that at this time, and possibly never. (However, under memory pressure, std-fs would stall the executor thread on kernel page cache writeback disk IO. That's why we want to use `tokio-epoll-uring`. And we likely want to use O_DIRECT in the future, at which point std-fs becomes an absolute show-stopper.) More elaborate analysis: https://neondatabase.notion.site/Why-test_bulk_ingest-is-slower-with-tokio-epoll-uring-918c5e619df045a7bd7b5f806cfbd53f?pvs=4 # Changes This PR increases the buffer size of `blob_io` and `EphemeralFile` from PAGE_SZ=8k to 64k. Longer-term, we probably want to do double-buffering / pipelined IO. # Resource Usage We currently do not flush the buffer when freezing the InMemoryLayer. That means a single Timeline can have multiple 64k buffers alive, esp if flushing is slow. This poses an OOM risk. We should either bound the number of frozen layers (https://github.com/neondatabase/neon/issues/7317). Or we should change the freezing code to flush the buffer and drop the allocation. However, that's future work. 
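To make the resource-usage concern concrete, here is a tiny illustrative calculation. The frozen-layer count below is a hypothetical assumption for the example only; nothing in this PR enforces or measures it.

```rust
// Illustrative arithmetic only: worst-case write-buffer memory pinned by
// unflushed frozen layers after raising the buffer size from 8 KiB to 64 KiB.
// The frozen-layer count is a made-up example value, not a real limit.
const TAIL_SZ_OLD: usize = 8 * 1024; // previous buffer size (PAGE_SZ)
const TAIL_SZ_NEW: usize = 64 * 1024; // buffer size after this PR
const FROZEN_LAYERS: usize = 16; // hypothetical number of unflushed frozen layers

fn main() {
    println!("old: {} KiB pinned", FROZEN_LAYERS * TAIL_SZ_OLD / 1024); // 128 KiB
    println!("new: {} KiB pinned", FROZEN_LAYERS * TAIL_SZ_NEW / 1024); // 1024 KiB
}
```
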
# Performance (Measurements done on i3en.3xlarge.) The `test_bulk_insert.py` is too noisy, even with instance storage. It varies by 30-40%. I suspect that's due to compaction. Raising amount of data by 10x doesn't help with the noisiness.) So, I used the `bench_ingest` from @jcsp 's #7409 . Specifically, the `ingest-small-values/ingest 128MB/100b seq` and `ingest-small-values/ingest 128MB/100b seq, no delta` benchmarks. | | | seq | seq, no delta | |-----|-------------------|-----|---------------| | 8k | std-fs | 55 | 165 | | 8k | tokio-epoll-uring | 37 | 107 | | 64k | std-fs | 55 | 180 | | 64k | tokio-epoll-uring | 48 | 164 | The `8k` is from before this PR, the `64k` is with this PR. The values are the throughput reported by the benchmark (MiB/s). We see that this PR gets `tokio-epoll-uring` from 67% to 87% of `std-fs` performance in the `seq` benchmark. Notably, `seq` appears to hit some other bottleneck at `55 MiB/s`. CC'ing #7418 due to the apparent bottlenecks in writing delta layers. For `seq, no delta`, this PR gets `tokio-epoll-uring` from 64% to 91% of `std-fs` performance. --- pageserver/src/tenant/blob_io.rs | 2 +- pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 0d33100ead..6e90b3e8ff 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -121,7 +121,7 @@ impl BlobWriter { self.offset } - const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; + const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 }; /// Writes the given buffer directly to the underlying `VirtualFile`. /// You need to make sure that the internal buffer is empty, otherwise diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index 34944b1072..4159b5820a 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -27,7 +27,7 @@ use crate::{ }, }; -const TAIL_SZ: usize = PAGE_SZ; +const TAIL_SZ: usize = 64 * 1024; /// See module-level comment. pub struct RW { From af43f78561cb8603e0b864cbfb18f5324155b613 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 26 Apr 2024 14:53:05 +0100 Subject: [PATCH 091/157] pageserver: fix image layer creation check that inhibited compaction (#7420) ## Problem PR #7230 attempted to introduce a WAL ingest threshold for checking whether enough deltas are stacked to warrant creating a new image layer. However, this check was incorrectly performed at the compaction partition level instead of the timeline level. Hence, it inhibited GC for any keys outside of the first partition. ## Summary of Changes Hoist the check up to the timeline level. --- pageserver/src/tenant/timeline.rs | 48 ++++++++++++++++---------- test_runner/regress/test_compaction.py | 2 -- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f1387e10ac..eb72ce9629 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3906,24 +3906,6 @@ impl Timeline { // Is it time to create a new image layer for the given partition? 
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { - let last = self.last_image_layer_creation_check_at.load(); - if lsn != Lsn(0) { - let distance = lsn - .checked_sub(last) - .expect("Attempt to compact with LSN going backwards"); - - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting below if we've not ingested - // sufficient WAL since the last check. - if distance.0 < min_distance { - return false; - } - } - - self.last_image_layer_creation_check_at.store(lsn); - let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; @@ -3995,9 +3977,37 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; + let check_for_image_layers = { + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = self.get_image_layer_creation_check_threshold() as u64 + * self.get_checkpoint_distance(); + + // Skip the expensive delta layer counting if this timeline has not ingested sufficient + // WAL since the last check. + distance.0 >= min_distance + }; + + if check_for_image_layers { + self.last_image_layer_creation_check_at.store(lsn); + } + for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - if !force && !self.time_for_new_image_layer(partition, lsn).await { + + let do_it = if force { + true + } else if check_for_image_layers { + // [`Self::time_for_new_image_layer`] is CPU expensive, + // so skip if we've not collected enough WAL since the last time + self.time_for_new_image_layer(partition, lsn).await + } else { + false + }; + + if !do_it { start = img_range.end; continue; } diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 37b87b92a9..3902819d3d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -14,8 +14,6 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { # Compact small layers "compaction_target_size": 1024**2, "image_creation_threshold": 2, - # INC-186: remove when merging the fix - "image_layer_creation_check_threshold": 0, } From 39427925c2f9fa6966aec9da66408aa134d30ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 26 Apr 2024 16:23:25 +0200 Subject: [PATCH 092/157] Return Past instead of Present or Future when commit_lsn < min_lsn (#7520) Implements an approach different from the one #7488 chose: We now return `past` instead of `present` (or`future`) when encountering the edge case where commit_lsn < min_lsn. In my opinion, both `past` and `present` are correct responses, but past is slightly better as the lsn returned by `present` with #7488 is one too "new". In practice, this shouldn't matter much, but shrug. 
We agreed in slack that this is the better approach: https://neondb.slack.com/archives/C03F5SM1N02/p1713871064147029 --- pageserver/src/pgdatadir_mapping.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 14bcc50e7e..c76c2d5451 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -445,11 +445,6 @@ impl Timeline { // include physical changes from later commits that will be marked // as aborted, and will need to be vacuumed away. let commit_lsn = Lsn((low - 1) * 8); - // This maxing operation is for the edge case that the search above did - // set found_smaller to true but it never increased the lsn. Then, low - // is still the old min_lsn the subtraction above could possibly give a value - // below the anchestor_lsn. - let commit_lsn = commit_lsn.max(min_lsn); match (found_smaller, found_larger) { (false, false) => { // This can happen if no commit records have been processed yet, e.g. @@ -460,6 +455,12 @@ impl Timeline { // Didn't find any commit timestamps smaller than the request Ok(LsnForTimestamp::Past(min_lsn)) } + (true, _) if commit_lsn < min_lsn => { + // the search above did set found_smaller to true but it never increased the lsn. + // Then, low is still the old min_lsn, and the subtraction above gave a value + // below the min_lsn. We should never do that. + Ok(LsnForTimestamp::Past(min_lsn)) + } (true, false) => { // Only found commits with timestamps smaller than the request. // It's still a valid case for branch creation, return it. From dbe0aa653ac2d0c3ef0a8087b7ab8878d1e59c9a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 26 Apr 2024 11:48:47 -0400 Subject: [PATCH 093/157] feat(pageserver): add aux-file-v2 flag on tenant level (#7505) Changing metadata format is not easy. This pull request adds a tenant-level flag on whether to enable aux file v2. As long as we don't roll this out to the user and guarantee our staging projects can persist tenant config correctly, we can test the aux file v2 change with setting this flag. Previous discussion at https://github.com/neondatabase/neon/pull/7424. 
Signed-off-by: Alex Chi Z --- control_plane/src/pageserver.rs | 10 ++++++++++ libs/pageserver_api/src/models.rs | 1 + pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 13 +++++++++++++ pageserver/src/tenant/timeline.rs | 9 +++++++++ test_runner/regress/test_attach_tenant_config.py | 1 + 6 files changed, 35 insertions(+) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index adac7d7bb5..0699e47866 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -441,6 +441,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_to_aux_file_v2: settings + .remove("switch_to_aux_file_v2") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -559,6 +564,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_to_aux_file_v2: settings + .remove("switch_to_aux_file_v2") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, } }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 4ce1ecde26..e2acde6139 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -303,6 +303,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub switch_to_aux_file_v2: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ff6194ab00..32c0606fc2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3664,6 +3664,7 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a2bb479f63..9975c9edbc 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -369,6 +369,10 @@ pub struct TenantConf { // How much WAL must be ingested before checking again whether a new image layer is required. // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + + /// Switch to aux file v2. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. 
+ pub switch_to_aux_file_v2: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -464,6 +468,10 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub switch_to_aux_file_v2: Option, } impl TenantConfOpt { @@ -521,6 +529,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + switch_to_aux_file_v2: self + .switch_to_aux_file_v2 + .unwrap_or(global_conf.switch_to_aux_file_v2), } } } @@ -562,6 +573,7 @@ impl Default for TenantConf { lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_to_aux_file_v2: false, } } } @@ -636,6 +648,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + switch_to_aux_file_v2: value.switch_to_aux_file_v2, } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index eb72ce9629..a05e0da260 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1871,6 +1871,15 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { + #[allow(dead_code)] + pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .switch_to_aux_file_v2 + .unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2) + } + pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 909d25980b..59461cc095 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -190,6 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, + "switch_to_aux_file_v2": True, } ps_http = env.pageserver.http_client() From ee3437cbd8d539d00cc0789b7314d8a995668a9d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 26 Apr 2024 13:35:01 -0400 Subject: [PATCH 094/157] chore(pageserver): shrink aux keyspace to 0x60-0x7F (#7502) extracted from https://github.com/neondatabase/neon/pull/7468, part of https://github.com/neondatabase/neon/issues/7462. In the page server, we use i128 (instead of u128) to do the integer representation of the key, which indicates that the highest bit of the key should not be 1. This constraints our keyspace to <= 0x7F. Also fix the bug of `to_i128` that dropped the highest 4b. Now we keep 3b of them, dropping the sign bit. And on that, we shrink the metadata keyspace to 0x60-0x7F for now, and once we add support for u128, we can have a larger metadata keyspace. 
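To make the `to_i128` fix concrete, here is a standalone round-trip check for the prefix byte only (the real conversion also packs the remaining key fields): with the old `& 0xf` mask a metadata prefix such as `AUX_KEY_PREFIX` (0x62) came back mangled, while the new `& 0x7F` mask preserves every prefix below 0x80 and still drops the i128 sign bit.

```rust
// Pack field1 into the top of an i128 and extract it again, as to_i128/from_i128 do.
fn field1_roundtrip(field1: u8, mask: u8) -> u8 {
    let packed: i128 = ((field1 & mask) as i128) << 120;
    ((packed >> 120) & mask as i128) as u8
}

fn main() {
    assert_eq!(field1_roundtrip(0x62, 0x0f), 0x02); // old mask: prefix lost
    assert_eq!(field1_roundtrip(0x62, 0x7f), 0x62); // new mask: prefix preserved
}
```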
--------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 53 ++++++++++++++++++++--------- libs/pageserver_api/src/keyspace.rs | 16 ++++++--- pageserver/src/aux_file.rs | 12 +++---- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 01919e8325..ea6115853e 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -4,7 +4,6 @@ use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; -use std::ops::RangeInclusive; use std::{fmt, ops::Range}; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -30,24 +29,25 @@ pub const KEY_SIZE: usize = 18; /// See [`Key::to_i128`] for more information on the encoding. pub const METADATA_KEY_SIZE: usize = 16; -/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 is a metadata key. -pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80; +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; +pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; /// The (reserved) key prefix of relation sizes. -pub const RELATION_SIZE_PREFIX: u8 = 0x81; +pub const RELATION_SIZE_PREFIX: u8 = 0x61; /// The key prefix of AUX file keys. -pub const AUX_KEY_PREFIX: u8 = 0x82; +pub const AUX_KEY_PREFIX: u8 = 0x62; /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key_slice(key: &[u8]) -> bool { - key[0] >= METADATA_KEY_BEGIN_PREFIX + key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX } impl Key { /// Check if the key falls in the range of metadata keys. pub const fn is_metadata_key(&self) -> bool { - self.field1 >= METADATA_KEY_BEGIN_PREFIX + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX } /// Encode a metadata key to a storage key. @@ -80,7 +80,7 @@ impl Key { } /// Get the range of metadata keys. - pub fn metadata_key_range() -> RangeInclusive { + pub fn metadata_key_range() -> Range { Key { field1: METADATA_KEY_BEGIN_PREFIX, field2: 0, @@ -88,13 +88,32 @@ impl Key { field4: 0, field5: 0, field6: 0, - }..=Key { - field1: u8::MAX, - field2: u16::MAX as u32, - field3: u32::MAX, - field4: u32::MAX, - field5: u8::MAX, - field6: u32::MAX, + }..Key { + field1: METADATA_KEY_END_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + + /// Get the range of aux keys. 
+ pub fn metadata_aux_key_range() -> Range { + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: AUX_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, } } @@ -103,7 +122,7 @@ impl Key { /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); - (((self.field1 & 0xf) as i128) << 120) + (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) | ((self.field4 as i128) << 40) @@ -113,7 +132,7 @@ impl Key { pub const fn from_i128(x: i128) -> Self { Key { - field1: ((x >> 120) & 0xf) as u8, + field1: ((x >> 120) & 0x7F) as u8, field2: ((x >> 104) & 0xFFFF) as u32, field3: (x >> 72) as u32, field4: (x >> 40) as u32, diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index f73648d306..eed4835f25 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -15,7 +15,13 @@ pub struct KeySpace { } impl KeySpace { - /// + /// Create a key space with a single range. + pub fn single(key_range: Range) -> Self { + Self { + ranges: vec![key_range], + } + } + /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// @@ -64,6 +70,10 @@ impl KeySpace { KeyPartitioning { parts } } + pub fn is_empty(&self) -> bool { + self.total_size() == 0 + } + /// Merge another keyspace into the current one. /// Note: the keyspaces must not ovelap (enforced via assertions) pub fn merge(&mut self, other: &KeySpace) { @@ -162,10 +172,6 @@ impl KeySpace { .sum() } - pub fn is_empty(&self) -> bool { - self.total_size() == 0 - } - fn overlaps_at(&self, range: &Range) -> Option { match self.ranges.binary_search_by_key(&range.end, |r| r.start) { Ok(0) => None, diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index aba4ccf19d..a343acaf7a 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -85,27 +85,27 @@ mod tests { // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions // of the page server. 
assert_eq!( - "8200000101E5B20C5F8DD5AA3289D6D9EAFA", + "6200000101E5B20C5F8DD5AA3289D6D9EAFA", encode_aux_file_key("pg_logical/mappings/test1").to_string() ); assert_eq!( - "820000010239AAC544893139B26F501B97E6", + "620000010239AAC544893139B26F501B97E6", encode_aux_file_key("pg_logical/snapshots/test2").to_string() ); assert_eq!( - "820000010300000000000000000000000000", + "620000010300000000000000000000000000", encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() ); assert_eq!( - "82000001FF8635AF2134B7266EC5B4189FD6", + "62000001FF8635AF2134B7266EC5B4189FD6", encode_aux_file_key("pg_logical/unsupported").to_string() ); assert_eq!( - "8200000201772D0E5D71DE14DA86142A1619", + "6200000201772D0E5D71DE14DA86142A1619", encode_aux_file_key("pg_replslot/test3").to_string() ); assert_eq!( - "820000FFFF1866EBEB53B807B26A2416F317", + "620000FFFF1866EBEB53B807B26A2416F317", encode_aux_file_key("other_file_not_supported").to_string() ); } From 75b4440d0786b4f53c5ca26e9c7ed8b88bc4b40b Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 26 Apr 2024 17:09:51 -0400 Subject: [PATCH 095/157] fix(virtual_file): compile warnings on macos (#7525) starting at commit https://github.com/neondatabase/neon/commit/dbb0c967d5fb5104847fb71e8d783ebeae3e7ff2, macOS reports warning for a few functions in the virtual file module. Signed-off-by: Alex Chi Z --- .../virtual_file/owned_buffers_io/util/size_tracking_writer.rs | 1 + pageserver/src/virtual_file/owned_buffers_io/write.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 107ada4c13..c2817699c3 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -24,6 +24,7 @@ impl Writer { /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. + #[cfg_attr(target_os = "macos", allow(dead_code))] pub fn into_inner(self) -> (u64, W) { (self.bytes_amount, self.dst) } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index d419f02f2d..738a642332 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -56,6 +56,7 @@ where self.buf() } + #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn flush_and_into_inner(mut self) -> std::io::Result { self.flush().await?; let Self { buf, writer } = self; @@ -70,6 +71,7 @@ where .expect("must not use after we returned an error") } + #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<(usize, S)> where S: IoBuf + Send, From 3695a1efa1c88c3b98106f5a2a8e74d655e467b1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 29 Apr 2024 07:14:53 +0300 Subject: [PATCH 096/157] metrics: record time to update gc info as a per timeline metric (#7473) We know that updating gc info can take a very long time from [recent incident], and holding `Tenant::gc_cs` affects many per-tenant operations in the system. We need a direct way to observe the time it takes. The solution is to add metrics so that we know when this happens: - 2 new per-timeline metric - 1 new global histogram Verified that the buckets are okay-ish in [dashboard]. 
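One detail worth calling out from the diff: the per-timeline timer is wrapped so that it still records when `update_gc_info` takes an early `?` return. A toy version of that drop-guard pattern (illustrative names, not the actual `AlwaysRecordingStorageTimeMetricsTimer`):

```rust
use std::time::Instant;

struct AlwaysRecordTimer {
    start: Instant,
}

impl Drop for AlwaysRecordTimer {
    // Runs on every exit path, including early returns and `?`.
    fn drop(&mut self) {
        println!("observed {:?}", self.start.elapsed());
    }
}

fn might_bail(fail: bool) -> Result<(), String> {
    let _timer = AlwaysRecordTimer { start: Instant::now() };
    if fail {
        return Err("early return still gets recorded".into());
    }
    Ok(())
}

fn main() {
    let _ = might_bail(true);
    let _ = might_bail(false);
}
```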
In our current state, we will see a lot more of `Inf,` but that is probably okay; at least we can learn which timelines are having issues. Can we afford to add these metrics? A bit unclear, see [another dashboard] with top pageserver `/metrics` response sizes. [dashboard]: https://neonprod.grafana.net/d/b7a5a5e2-1276-4bb0-9e3a-b4528adb6eb6/storage-operations-histograms-in-prod?orgId=1&var-datasource=ZNX49CDVz&var-instance=All&var-operation=All&from=now-7d&to=now [another dashboard]: https://neonprod.grafana.net/d/MQx4SN-Vk/metric-sizes-on-prod-and-some-correlations?orgId=1 [recent incident]: https://neondb.slack.com/archives/C06UEMLK7FE/p1713817696580119?thread_ts=1713468604.508969&cid=C06UEMLK7FE --- pageserver/src/metrics.rs | 27 +++++++++++++++++++++++++++ pageserver/src/tenant.rs | 6 +++++- pageserver/src/tenant/size.rs | 3 +++ pageserver/src/tenant/timeline.rs | 6 ++++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e4b314f805..d3c8c423e4 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -51,6 +51,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, + #[strum(serialize = "update gc info")] + UpdateGcInfo, + #[strum(serialize = "create tenant")] CreateTenant, } @@ -1910,6 +1913,22 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(duration); } + + /// Turns this timer into a timer, which will always record -- usually this means recording + /// regardless an early `?` path was taken in a function. + pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer { + AlwaysRecordingStorageTimeMetricsTimer(Some(self)) + } +} + +pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); + +impl Drop for AlwaysRecordingStorageTimeMetricsTimer { + fn drop(&mut self) { + if let Some(inner) = self.0.take() { + inner.stop_and_record(); + } + } } /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and @@ -1970,6 +1989,7 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, + pub update_gc_info_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2030,6 +2050,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let update_gc_info_histo = StorageTimeMetrics::new( + StorageTimeOperation::UpdateGcInfo, + &tenant_id, + &shard_id, + &timeline_id, + ); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2072,6 +2098,7 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, + update_gc_info_histo, load_layer_map_histo, last_record_gauge, resident_physical_size_gauge, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 32c0606fc2..02ce65922e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3402,7 +3402,11 @@ impl Tenant { // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. - let mut shared_cache = self.cached_logical_sizes.lock().await; + let mut shared_cache = tokio::select! 
{ + locked = self.cached_logical_sizes.lock() => locked, + _ = cancel.cancelled() => anyhow::bail!("cancelled"), + _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"), + }; size::gather_inputs( self, diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index ad79b74d8b..f521dfa55d 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -118,6 +118,9 @@ pub(super) async fn gather_inputs( ctx: &RequestContext, ) -> anyhow::Result { // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + // + // FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the + // whole computation. It does not make sense from the billing perspective. tenant .refresh_gc_info(cancel, ctx) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a05e0da260..c10adf4c22 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4345,6 +4345,12 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result<()> { + let _timer = self + .metrics + .update_gc_info_histo + .start_timer() + .record_on_drop(); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // // Some unit tests depend on garbage-collection working even when From b655c7030ff2172e32f0c6e2e056183aa3b70d81 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Apr 2024 08:52:18 +0100 Subject: [PATCH 097/157] neon_local: add "tenant import" (#7399) ## Problem Sometimes we have test data in the form of S3 contents that we would like to run live in a neon_local environment. ## Summary of changes - Add a storage controller API that imports an existing tenant. Currently this is equivalent to doing a create with a high generation number, but in future this would be something smarter to probe S3 to find the shards in a tenant and find generation numbers. - Add a `neon_local` command that invokes the import API, and then inspects timelines in the newly attached tenant to create matching branches. 
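For local use this looks roughly like `neon_local tenant import --tenant-id <tenant_id>` (see the new subcommand and test fixture in the diff). A condensed sketch of how the import endpoint turns the remote-storage scan into creation parameters (field names simplified from `TenantScanRemoteStorageResponse`): a post-split tenant can leave several shard counts behind in S3, so the highest count wins, and the tenant is attached with the highest generation seen across shards.

```rust
struct ScannedShard {
    shard_count: u8,
    generation: u32,
}

fn import_parameters(scanned: &[ScannedShard]) -> Option<(u8, u32)> {
    // Highest shard count = the post-split layout; highest generation keeps
    // the newest index_part visible after attach.
    let shard_count = scanned.iter().map(|s| s.shard_count).max()?;
    let generation = scanned.iter().map(|s| s.generation).max()?;
    Some((shard_count, generation))
}

fn main() {
    let scanned = vec![
        ScannedShard { shard_count: 1, generation: 3 },
        ScannedShard { shard_count: 4, generation: 7 },
    ];
    assert_eq!(import_parameters(&scanned), Some((4, 7)));
}
```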
--- control_plane/src/bin/neon_local.rs | 50 +++++++ control_plane/src/storage_controller.rs | 10 ++ libs/pageserver_api/src/models.rs | 11 ++ libs/utils/src/generation.rs | 2 + pageserver/client/src/mgmt_api.rs | 13 ++ pageserver/src/http/routes.rs | 82 +++++++++++ pageserver/src/tenant.rs | 12 +- .../src/tenant/remote_timeline_client.rs | 11 +- .../tenant/remote_timeline_client/download.rs | 80 ++++++----- storage_controller/src/http.rs | 19 +++ storage_controller/src/pageserver_client.rs | 17 ++- storage_controller/src/service.rs | 130 ++++++++++++++++-- test_runner/fixtures/neon_fixtures.py | 12 ++ .../regress/test_storage_controller.py | 83 +++++++++++ test_runner/regress/test_tenant_detach.py | 2 +- 15 files changed, 481 insertions(+), 53 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 7f8f6d21e0..1a9e9a1e6a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -417,6 +417,54 @@ async fn handle_tenant( println!("{} {:?}", t.id, t.state); } } + Some(("import", import_match)) => { + let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + + let storage_controller = StorageController::from_env(env); + let create_response = storage_controller.tenant_import(tenant_id).await?; + + let shard_zero = create_response + .shards + .first() + .expect("Import response omitted shards"); + + let attached_pageserver_id = shard_zero.node_id; + let pageserver = + PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?); + + println!( + "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}" + ); + + let timelines = pageserver + .http_client + .list_timelines(shard_zero.shard_id) + .await?; + + // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names + let main_timeline = timelines + .iter() + .find(|t| t.ancestor_timeline_id.is_none()) + .expect("No timelines found") + .timeline_id; + + let mut branch_i = 0; + for timeline in timelines.iter() { + let branch_name = if timeline.timeline_id == main_timeline { + "main".to_string() + } else { + branch_i += 1; + format!("branch_{branch_i}") + }; + + println!( + "Importing timeline {tenant_id}/{} as branch {branch_name}", + timeline.timeline_id + ); + + env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; + } + } Some(("create", create_match)) => { let tenant_conf: HashMap<_, _> = create_match .get_many::("config") @@ -1480,6 +1528,8 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) + .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) + .about("Import a tenant that is present in remote storage, and create branches for its timelines")) ) .subcommand( Command::new("pageserver") diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 7f2b973391..dbb4475ae8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -472,6 +472,16 @@ impl StorageController { .await } + #[instrument(skip(self))] + pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result { + self.dispatch::<(), TenantCreateResponse>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/import"), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { 
self.dispatch::<(), _>( diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e2acde6139..c752799c4c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -782,6 +782,17 @@ pub struct SecondaryProgress { pub bytes_total: u64, } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantScanRemoteStorageShard { + pub tenant_shard_id: TenantShardId, + pub generation: Option, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TenantScanRemoteStorageResponse { + pub shards: Vec, +} + pub mod virtual_file { #[derive( Copy, diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index af15cee924..b703e883de 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -34,6 +34,8 @@ pub enum Generation { /// scenarios where pageservers might otherwise issue conflicting writes to /// remote storage impl Generation { + pub const MAX: Self = Self::Valid(u32::MAX); + /// Create a new Generation that represents a legacy key format with /// no generation suffix pub fn none() -> Self { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 892e6c2758..012cb1a662 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -243,6 +243,19 @@ impl Client { Ok(()) } + pub async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/scan_remote_storage", + self.mgmt_api_endpoint + ); + let response = self.request(Method::GET, &uri, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 81508965b4..9a280c2e0c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,6 +19,8 @@ use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantScanRemoteStorageResponse; +use pageserver_api::models::TenantScanRemoteStorageShard; use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; @@ -29,6 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; +use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; use tenant_size_model::{SizeResult, StorageModel}; @@ -54,6 +57,9 @@ use crate::tenant::mgr::{ }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; use crate::tenant::remote_timeline_client; +use crate::tenant::remote_timeline_client::download_index_part; +use crate::tenant::remote_timeline_client::list_remote_tenant_shards; +use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; @@ -2035,6 +2041,79 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn tenant_scan_remote_handler( + request: Request, + cancel: CancellationToken, +) -> Result, 
ApiError> { + let state = get_state(&request); + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + + let Some(remote_storage) = state.remote_storage.as_ref() else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Remote storage not configured" + ))); + }; + + let mut response = TenantScanRemoteStorageResponse::default(); + + let (shards, _other_keys) = + list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + for tenant_shard_id in shards { + let (timeline_ids, _other_keys) = + list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + let mut generation = Generation::none(); + for timeline_id in timeline_ids { + match download_index_part( + remote_storage, + &tenant_shard_id, + &timeline_id, + Generation::MAX, + &cancel, + ) + .instrument(info_span!("download_index_part", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id)) + .await + { + Ok((index_part, index_generation)) => { + tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn()); + generation = std::cmp::max(generation, index_generation); + } + Err(DownloadError::NotFound) => { + // This is normal for tenants that were created with multiple shards: they have an unsharded path + // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. + tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + continue; + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + }; + } + + response.shards.push(TenantScanRemoteStorageShard { + tenant_shard_id, + generation: generation.into(), + }); + } + + if response.shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(), + )); + } + + json_response(StatusCode::OK, response) +} + async fn secondary_download_handler( request: Request, _cancel: CancellationToken, @@ -2431,6 +2510,9 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) + .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| { + api_handler(r, tenant_scan_remote_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 02ce65922e..cb3e36efb3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -888,7 +888,7 @@ impl Tenant { #[instrument(skip_all)] pub(crate) async fn preload( - self: &Arc, + self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { @@ -918,9 +918,13 @@ impl Tenant { Ok(TenantPreload { deleting, - timelines: self - .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: Self::load_timeline_metadata( + self, + remote_timeline_ids, + remote_storage, + cancel, + ) + .await?, }) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index d02f00adad..c0767345ca 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -243,7 +243,9 @@ use 
super::storage_layer::{Layer, LayerFileName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; -pub(crate) use download::{is_temp_download_file, list_remote_timelines}; +pub(crate) use download::{ + download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, +}; pub(crate) use index::LayerFileMetadata; // Occasional network issues and such can cause remote operations to fail, and @@ -472,7 +474,7 @@ impl RemoteTimelineClient { }, ); - let index_part = download::download_index_part( + let (index_part, _index_generation) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -1716,6 +1718,11 @@ impl RemoteTimelineClient { } } +pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 1852e4b4ff..250354ac20 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,6 +5,7 @@ use std::collections::HashSet; use std::future::Future; +use std::str::FromStr; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -25,13 +26,13 @@ use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, - INITDB_PATH, + remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -253,42 +254,31 @@ pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { } } -/// List timelines of given tenant in remote storage -pub async fn list_remote_timelines( +async fn list_identifiers( storage: &GenericRemoteStorage, - tenant_shard_id: TenantShardId, + prefix: RemotePath, cancel: CancellationToken, -) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); - - fail::fail_point!("storage-sync-list-remote-timelines", |_| { - anyhow::bail!("storage-sync-list-remote-timelines"); - }); - +) -> anyhow::Result<(HashSet, HashSet)> +where + T: FromStr + Eq + std::hash::Hash, +{ let listing = download_retry_forever( - || { - storage.list( - Some(&remote_path), - ListingMode::WithDelimiter, - None, - &cancel, - ) - }, - &format!("list timelines for {tenant_shard_id}"), + || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel), + &format!("list identifiers in prefix {prefix}"), &cancel, ) .await?; - let mut timeline_ids = HashSet::new(); + let mut parsed_ids = HashSet::new(); let mut other_prefixes = HashSet::new(); - for timeline_remote_storage_key in listing.prefixes { - let object_name = 
timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") + for id_remote_storage_key in listing.prefixes { + let object_name = id_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}") })?; - match object_name.parse::() { - Ok(t) => timeline_ids.insert(t), + match object_name.parse::() { + Ok(t) => parsed_ids.insert(t), Err(_) => other_prefixes.insert(object_name.to_string()), }; } @@ -300,7 +290,31 @@ pub async fn list_remote_timelines( other_prefixes.insert(object_name.to_string()); } - Ok((timeline_ids, other_prefixes)) + Ok((parsed_ids, other_prefixes)) +} + +/// List shards of given tenant in remote storage +pub(crate) async fn list_remote_tenant_shards( + storage: &GenericRemoteStorage, + tenant_id: TenantId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id)); + list_identifiers::(storage, remote_path, cancel).await +} + +/// List timelines of given tenant shard in remote storage +pub async fn list_remote_timelines( + storage: &GenericRemoteStorage, + tenant_shard_id: TenantShardId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); + list_identifiers::(storage, remote_path, cancel).await } async fn do_download_index_part( @@ -309,7 +323,7 @@ async fn do_download_index_part( timeline_id: &TimelineId, index_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> Result<(IndexPart, Generation), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let index_part_bytes = download_retry_forever( @@ -334,7 +348,7 @@ async fn do_download_index_part( .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok(index_part) + Ok((index_part, index_generation)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -343,13 +357,13 @@ async fn do_download_index_part( /// In this function we probe for the most recent index in a generation <= our current generation. /// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(super) async fn download_index_part( +pub(crate) async fn download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> Result<(IndexPart, Generation), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 2e83bbc5ed..09a25a5be0 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -522,6 +522,18 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) 
} +async fn handle_tenant_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.tenant_import(tenant_id).await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -759,6 +771,13 @@ pub fn make_router( .post("/debug/v1/node/:node_id/drop", |r| { named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) }) + .post("/debug/v1/tenant/:tenant_id/import", |r| { + named_request_span( + r, + handle_tenant_import, + RequestName("debug_v1_tenant_import"), + ) + }) .get("/debug/v1/tenant", |r| { named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) }) diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 8237229d7b..0cea205599 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,13 +1,14 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, }, shard::TenantShardId, }; use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; -use utils::id::{NodeId, TimelineId}; +use utils::id::{NodeId, TenantId, TimelineId}; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. @@ -88,6 +89,18 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + measured_request!( + "tenant_scan_remote_storage", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_scan_remote_storage(tenant_id).await + ) + } + pub(crate) async fn tenant_secondary_download( &self, tenant_id: TenantShardId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 952664e339..df1008b64e 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -110,6 +110,42 @@ struct ServiceState { delayed_reconcile_rx: tokio::sync::mpsc::Receiver, } +/// Transform an error from a pageserver into an error to return to callers of a storage +/// controller API. 
+fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { + match e { + mgmt_api::Error::ReceiveErrorBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable( + format!("{node} error receiving error body: {str}").into(), + ) + } + mgmt_api::Error::ReceiveBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into()) + } + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { + ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into()) + } + mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg) => { + ApiError::ResourceUnavailable(format!("{node}: {msg}").into()) + } + mgmt_api::Error::ApiError(status @ StatusCode::UNAUTHORIZED, msg) + | mgmt_api::Error::ApiError(status @ StatusCode::FORBIDDEN, msg) => { + // Auth errors talking to a pageserver are not auth errors for the caller: they are + // internal server errors, showing that something is wrong with the pageserver or + // storage controller's auth configuration. + ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) + } + mgmt_api::Error::ApiError(status, msg) => { + // Presume general case of pageserver API errors is that we tried to do something + // that can't be done right now. + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } + mgmt_api::Error::Cancelled => ApiError::ShuttingDown, + } +} + impl ServiceState { fn new( nodes: HashMap, @@ -2519,17 +2555,7 @@ impl Service { client .timeline_create(tenant_shard_id, &create_req) .await - .map_err(|e| match e { - mgmt_api::Error::ApiError(status, msg) - if status == StatusCode::INTERNAL_SERVER_ERROR - || status == StatusCode::NOT_ACCEPTABLE => - { - // TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper - // for pass-through API calls. - ApiError::InternalServerError(anyhow::anyhow!(msg)) - } - _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), - }) + .map_err(|e| passthrough_api_error(&node, e)) } // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then @@ -3654,6 +3680,88 @@ impl Service { Ok(()) } + /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a + /// tenant with a very high generation number so that it will see the existing data. + pub(crate) async fn tenant_import( + &self, + tenant_id: TenantId, + ) -> Result { + // Pick an arbitrary available pageserver to use for scanning the tenant in remote storage + let maybe_node = { + self.inner + .read() + .unwrap() + .nodes + .values() + .find(|n| n.is_available()) + .cloned() + }; + let Some(node) = maybe_node else { + return Err(ApiError::BadRequest(anyhow::anyhow!("No nodes available"))); + }; + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + let scan_result = client + .tenant_scan_remote_storage(tenant_id) + .await + .map_err(|e| passthrough_api_error(&node, e))?; + + // A post-split tenant may contain a mixture of shard counts in remote storage: pick the highest count. 
+ let Some(shard_count) = scan_result + .shards + .iter() + .map(|s| s.tenant_shard_id.shard_count) + .max() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found").into(), + )); + }; + + // Ideally we would set each newly imported shard's generation independently, but for correctness it is sufficient + // to + let generation = scan_result + .shards + .iter() + .map(|s| s.generation) + .max() + .expect("We already validated >0 shards"); + + // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will + // only work if they were using the default stripe size. + let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + + let (response, waiters) = self + .do_tenant_create(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + + shard_parameters: ShardParameters { + count: shard_count, + stripe_size, + }, + placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking + + // There is no way to know what the tenant's config was: revert to defaults + config: TenantConfig::default(), + }) + .await?; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Since this is a debug/support operation, all kinds of weird issues are possible (e.g. this + // tenant doesn't exist in the control plane), so don't fail the request if it can't fully + // reconcile, as reconciliation includes notifying compute. + tracing::warn!(%tenant_id, "Reconcile not done yet while importing tenant ({e})"); + } + + Ok(response) + } + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that /// we don't have to make TenantShard clonable in the return path. pub(crate) fn tenants_dump(&self) -> Result, ApiError> { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fcd33bb66a..a94732a682 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1575,6 +1575,11 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return tenant_id, timeline_id + def import_tenant(self, tenant_id: TenantId): + args = ["tenant", "import", "--tenant-id", str(tenant_id)] + res = self.raw_cli(args) + res.check_returncode() + def set_default(self, tenant_id: TenantId): """ Update default tenant for future operations that require tenant_id. 
@@ -2207,6 +2212,13 @@ class NeonStorageController(MetricsGetter): headers=self.headers(TokenScope.ADMIN), ) + def tenant_import(self, tenant_id: TenantId): + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + headers=self.headers(TokenScope.ADMIN), + ) + def reconcile_all(self): r = self.request( "POST", diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b4b23745f8..bc1f8776b3 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -26,6 +26,7 @@ from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until +from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -1256,3 +1257,85 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # Quiesce any background reconciliation before doing consistency check env.storage_controller.reconcile_until_idle(timeout_secs=10) env.storage_controller.consistency_check() + + +@pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): + """ + Tenant import is a support/debug tool for recovering a tenant from remote storage + if we don't have any metadata for it in the storage controller. + """ + + # This test is parametrized on remote storage because it exercises the relatively rare + # code path of listing with a prefix that is not a directory name: this helps us notice + # quickly if local_fs or s3_bucket implementations diverge. + neon_env_builder.enable_pageserver_remote_storage(remote_storage) + + # Use multiple pageservers because some test helpers assume single sharded tenants + # if there is only one pageserver. 
+ neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + tenant_id = env.initial_tenant + + # Create a second timeline to ensure that import finds both + timeline_a = env.initial_timeline + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + workload_a = Workload(env, tenant_id, timeline_a, branch_name="main") + workload_a.init() + + workload_b = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload_b.init() + + # Write some data + workload_a.write_rows(72) + expect_rows_a = workload_a.expect_rows + workload_a.stop() + del workload_a + + # Bump generation to make sure generation recovery works properly + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Write some data in the higher generation into the other branch + workload_b.write_rows(107) + expect_rows_b = workload_b.expect_rows + workload_b.stop() + del workload_b + + # Detach from pageservers + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": "Detached", + }, + ) + env.storage_controller.reconcile_until_idle(timeout_secs=10) + + # Force-drop it from the storage controller + env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Now import it again + env.neon_cli.import_tenant(tenant_id) + + # Check we found the shards + describe = env.storage_controller.tenant_describe(tenant_id) + literal_shard_count = 1 if shard_count is None else shard_count + assert len(describe["shards"]) == literal_shard_count + + # Check the data is still there: this implicitly proves that we recovered generation numbers + # properly, for the timeline which was written to after a generation bump. + for timeline, branch, expect_rows in [ + (timeline_a, "main", expect_rows_a), + (timeline_b, "branch_1", expect_rows_b), + ]: + workload = Workload(env, tenant_id, timeline, branch_name=branch) + workload.expect_rows = expect_rows + workload.validate() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index d3f24cb06e..0ba0108651 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -132,7 +132,7 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*") + assert env.pageserver.log_contains(".*list identifiers.*failed, will retry.*") assert env.pageserver.log_contains(".*download.*failed, will retry.*") From 84914434e3fc63a26b817ba3fe8c2f0c8e545ea8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Apr 2024 09:59:22 +0100 Subject: [PATCH 098/157] storage controller: send startup compute notifications in background (#7495) ## Problem Previously, we try to send compute notifications in startup_reconcile before completing that function, with a time limit. Any notifications that don't happen within the time limit result in tenants having their `pending_compute_notification` flag set, which causes them to spawn a Reconciler next time the background reconciler loop runs. 
This causes two problems: - Spawning a lot of reconcilers after startup caused a spike in memory (this is addressed in https://github.com/neondatabase/neon/pull/7493) - After https://github.com/neondatabase/neon/pull/7493, spawning lots of reconcilers will block some other operations, e.g. a tenant creation might fail due to lack of reconciler semaphore units while the controller is busy running all the Reconcilers for its startup compute notifications. When the code was first written, ComputeHook didn't have internal ordering logic to ensure that notifications for a shard were sent in the right order. Since that was added in https://github.com/neondatabase/neon/pull/7088, we can use it to avoid waiting for notifications to complete in startup_reconcile. Related to: https://github.com/neondatabase/neon/issues/7460 ## Summary of changes - Add a `notify_background` method to ComputeHook. - Call this from startup_reconcile instead of doing notifications inline - Process completions from `notify_background` in `process_results`, and if a notification failed then set the `pending_compute_notification` flag on the shard. The result is that we will only spawn lots of Reconcilers if the compute notifications _fail_, not just because they take some significant amount of time. Test coverage for this case is in https://github.com/neondatabase/neon/pull/7475 --- storage_controller/src/compute_hook.rs | 158 +++++++++++++++++++------ storage_controller/src/service.rs | 158 ++++++++++--------------- 2 files changed, 183 insertions(+), 133 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 1ed8998713..44a156a5ec 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -3,11 +3,13 @@ use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; +use futures::StreamExt; use hyper::{Method, StatusCode}; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; use utils::{ backoff::{self}, id::{NodeId, TenantId}, @@ -420,48 +422,37 @@ impl ComputeHook { .and_then(|x| x) } - /// Call this to notify the compute (postgres) tier of new pageservers to use - /// for a tenant. notify() is called by each shard individually, and this function - /// will decide whether an update to the tenant is sent. An update is sent on the - /// condition that: - /// - We know a pageserver for every shard. - /// - All the shards have the same shard_count (i.e. we are not mid-split) - /// - /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler - /// that is cancelled. - /// - /// This function is fallible, including in the case that the control plane is transiently - /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability - /// periods, but we don't retry forever. The **caller** is responsible for handling failures and - /// ensuring that they eventually call again to ensure that the compute is eventually notified of - /// the proper pageserver nodes for a tenant. 
- #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( + /// Synchronous phase: update the per-tenant state for the next intended notification + fn notify_prepare( &self, tenant_shard_id: TenantShardId, node_id: NodeId, stripe_size: ShardStripeSize, + ) -> MaybeSendResult { + let mut state_locked = self.state.lock().unwrap(); + + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + } + + async fn notify_execute( + &self, + maybe_send_result: MaybeSendResult, + tenant_shard_id: TenantShardId, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let maybe_send_result = { - let mut state_locked = self.state.lock().unwrap(); - - use std::collections::hash_map::Entry; - let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), - Entry::Occupied(e) => { - let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); - tenant - } - }; - tenant.maybe_send(tenant_shard_id.tenant_id, None) - }; - // Process result: we may get an update to send, or we may have to wait for a lock // before trying again. let (request, mut send_lock_guard) = match maybe_send_result { @@ -469,7 +460,12 @@ impl ComputeHook { return Ok(()); } MaybeSendResult::AwaitLock(send_lock) => { - let send_locked = send_lock.lock_owned().await; + let send_locked = tokio::select! { + guard = send_lock.lock_owned() => {guard}, + _ = cancel.cancelled() => { + return Err(NotifyError::ShuttingDown) + } + }; // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses @@ -508,6 +504,94 @@ impl ComputeHook { } result } + + /// Infallible synchronous fire-and-forget version of notify(), that sends its results to + /// a channel. Something should consume the channel and arrange to try notifying again + /// if something failed. + pub(super) fn notify_background( + self: &Arc, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + result_tx: tokio::sync::mpsc::Sender>, + cancel: &CancellationToken, + ) { + let mut maybe_sends = Vec::new(); + for (tenant_shard_id, node_id, stripe_size) in notifications { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + maybe_sends.push((tenant_shard_id, maybe_send_result)) + } + + let this = self.clone(); + let cancel = cancel.clone(); + + tokio::task::spawn(async move { + // Construct an async stream of futures to invoke the compute notify function: we do this + // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. The + // ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which + // would mostly just be waiting on that semaphore. 
+ let mut stream = futures::stream::iter(maybe_sends) + .map(|(tenant_shard_id, maybe_send_result)| { + let this = this.clone(); + let cancel = cancel.clone(); + + async move { + this + .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .await.map_err(|e| (tenant_shard_id, e)) + }.instrument(info_span!( + "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + )) + }) + .buffered(API_CONCURRENCY); + + loop { + tokio::select! { + next = stream.next() => { + match next { + Some(r) => { + result_tx.send(r).await.ok(); + }, + None => { + tracing::info!("Finished sending background compute notifications"); + break; + } + } + }, + _ = cancel.cancelled() => { + tracing::info!("Shutdown while running background compute notifications"); + break; + } + }; + } + }); + } + + /// Call this to notify the compute (postgres) tier of new pageservers to use + /// for a tenant. notify() is called by each shard individually, and this function + /// will decide whether an update to the tenant is sent. An update is sent on the + /// condition that: + /// - We know a pageserver for every shard. + /// - All the shards have the same shard_count (i.e. we are not mid-split) + /// + /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler + /// that is cancelled. + /// + /// This function is fallible, including in the case that the control plane is transiently + /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability + /// periods, but we don't retry forever. The **caller** is responsible for handling failures and + /// ensuring that they eventually call again to ensure that the compute is eventually notified of + /// the proper pageserver nodes for a tenant. + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + stripe_size: ShardStripeSize, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + .await + } } #[cfg(test)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index df1008b64e..882562d99f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8,6 +8,7 @@ use std::{ }; use crate::{ + compute_hook::NotifyError, id_lock_map::IdLockMap, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, @@ -61,7 +62,7 @@ use utils::{ }; use crate::{ - compute_hook::{self, ComputeHook}, + compute_hook::ComputeHook, heartbeater::{Heartbeater, PageserverState}, node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, @@ -332,7 +333,12 @@ impl Service { /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date /// view of the world, and determine which pageservers are responsive. 
#[instrument(skip_all)] - async fn startup_reconcile(self: &Arc) { + async fn startup_reconcile( + self: &Arc, + bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< + Result<(), (TenantShardId, NotifyError)>, + >, + ) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) let mut observed: HashMap)>> = @@ -351,10 +357,6 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); - let compute_notify_deadline = start_at - .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3) - .expect("Reconcile timeout is a modest constant"); - // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); @@ -380,6 +382,7 @@ impl Service { let mut compute_notifications = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers + tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -446,28 +449,27 @@ impl Service { // Emit compute hook notifications for all tenants which are already stably attached. Other tenants // will emit compute hook notifications when they reconcile. // - // Ordering: we must complete these notification attempts before doing any other reconciliation for the - // tenants named here, because otherwise our calls to notify() might race with more recent values - // generated by reconciliation. - let notify_failures = self - .compute_notify_many(compute_notifications, compute_notify_deadline) - .await; - - // Compute notify is fallible. If it fails here, do not delay overall startup: set the - // flag on these shards that they have a pending notification. - // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. - { - let mut locked = self.inner.write().unwrap(); - for tenant_shard_id in notify_failures.into_iter() { - if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { - shard.pending_compute_notification = true; - } - } - } + // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later + // calls will be correctly ordered wrt these. + // + // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore + // unit and start doing I/O. + tracing::info!( + "Sending {} compute notifications", + compute_notifications.len() + ); + self.compute_hook.notify_background( + compute_notifications, + bg_compute_notify_result_tx.clone(), + &self.cancel, + ); // Finally, now that the service is up and running, launch reconcile operations for any tenants // which require it: under normal circumstances this should only include tenants that were in some // transient state before we restarted, or any tenants whose compute hooks failed above. + tracing::info!("Checking for shards in need of reconciliation..."); let reconcile_tasks = self.reconcile_all(); // We will not wait for these reconciliation tasks to run here: we're now done with startup and // normal operations may proceed. 
@@ -508,6 +510,7 @@ impl Service { } } + tracing::info!("Sending initial heartbeats..."); let res = self .heartbeater .heartbeat(Arc::new(nodes_to_heartbeat)) @@ -544,6 +547,7 @@ impl Service { let mut node_list_futs = FuturesUnordered::new(); + tracing::info!("Scanning shards on {} nodes...", nodes.len()); for node in nodes.values() { node_list_futs.push({ async move { @@ -663,72 +667,6 @@ impl Service { } } - /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications. - /// - /// Returns a set of any shards for which notifications where not acked within the deadline. - async fn compute_notify_many( - &self, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, - deadline: Instant, - ) -> HashSet { - let attempt_shards = notifications.iter().map(|i| i.0).collect::>(); - let mut success_shards = HashSet::new(); - - // Construct an async stream of futures to invoke the compute notify function: we do this - // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. - let mut stream = futures::stream::iter(notifications.into_iter()) - .map(|(tenant_shard_id, node_id, stripe_size)| { - let compute_hook = self.compute_hook.clone(); - let cancel = self.cancel.clone(); - async move { - if let Err(e) = compute_hook - .notify(tenant_shard_id, node_id, stripe_size, &cancel) - .await - { - tracing::error!( - %tenant_shard_id, - %node_id, - "Failed to notify compute on startup for shard: {e}" - ); - None - } else { - Some(tenant_shard_id) - } - } - }) - .buffered(compute_hook::API_CONCURRENCY); - - loop { - tokio::select! { - next = stream.next() => { - match next { - Some(Some(success_shard)) => { - // A notification succeeded - success_shards.insert(success_shard); - }, - Some(None) => { - // A notification that failed - }, - None => { - tracing::info!("Successfully sent all compute notifications"); - break; - } - } - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - // Give up sending any that didn't succeed yet - tracing::info!("Reached deadline while sending compute notifications"); - break; - } - }; - } - - attempt_shards - .difference(&success_shards) - .cloned() - .collect() - } - /// Long running background task that periodically wakes up and looks for shards that need /// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible @@ -887,23 +825,45 @@ impl Service { async fn process_results( &self, mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< + Result<(), (TenantShardId, NotifyError)>, + >, ) { loop { // Wait for the next result, or for cancellation - let result = tokio::select! { + tokio::select! 
{ r = result_rx.recv() => { match r { - Some(result) => {result}, + Some(result) => {self.process_result(result);}, None => {break;} } } + _ = async{ + match bg_compute_hook_result_rx.recv().await { + Some(result) => { + if let Err((tenant_shard_id, notify_error)) = result { + tracing::warn!("Marking shard {tenant_shard_id} for notification retry, due to error {notify_error}"); + let mut locked = self.inner.write().unwrap(); + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + + } + }, + None => { + // This channel is dead, but we don't want to terminate the outer loop{}: just wait for shutdown + self.cancel.cancelled().await; + } + } + } => {}, _ = self.cancel.cancelled() => { break; } }; - - self.process_result(result); } + + // We should only fall through on shutdown + assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -1064,6 +1024,10 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + // This channel is continuously consumed by process_results, so doesn't need to be very large. + let (bg_compute_notify_result_tx, bg_compute_notify_result_rx) = + tokio::sync::mpsc::channel(512); + let (delayed_reconcile_tx, delayed_reconcile_rx) = tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); @@ -1101,7 +1065,9 @@ impl Service { tokio::task::spawn(async move { // Block shutdown until we're done (we must respect self.cancel) if let Ok(_gate) = result_task_this.gate.enter() { - result_task_this.process_results(result_rx).await + result_task_this + .process_results(result_rx, bg_compute_notify_result_rx) + .await } }); @@ -1143,7 +1109,7 @@ impl Service { return; }; - this.startup_reconcile().await; + this.startup_reconcile(bg_compute_notify_result_tx).await; drop(startup_completion); } }); From 24ce878039fbf7b45b18cbcf4c7617b779338d2e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:49:42 +0200 Subject: [PATCH 099/157] proxy: Exclude compute and retries (#7529) ## Problem Alerts fire if the connection the compute is slow. ## Summary of changes Exclude compute and retry from latencies. 
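The change wraps the excluded waits (the compute connection attempts and the retry back-off sleeps) in RAII pause guards; the time spent while a guard is alive is accumulated per bucket and subtracted from the reported connection latency. A minimal sketch of that accounting, with simplified names rather than the proxy's actual types:

```rust
use std::time::{Duration, Instant};

#[derive(Default)]
struct Accumulated {
    compute: Duration,
    retry: Duration,
}

struct LatencyTimer {
    start: Instant,
    accumulated: Accumulated,
}

// Guard returned by `pause`: while it is alive the timer is "paused" for the
// chosen bucket, and the elapsed time is charged to that bucket on drop.
struct Pause<'a> {
    timer: &'a mut LatencyTimer,
    started: Instant,
    is_retry: bool,
}

impl LatencyTimer {
    fn new() -> Self {
        Self {
            start: Instant::now(),
            accumulated: Accumulated::default(),
        }
    }

    fn pause(&mut self, is_retry: bool) -> Pause<'_> {
        Pause {
            started: Instant::now(),
            is_retry,
            timer: self,
        }
    }

    // Wall-clock session latency minus everything that was excluded.
    fn excluding_compute_and_retry(&self) -> Duration {
        let excluded = self.accumulated.compute + self.accumulated.retry;
        self.start.elapsed().saturating_sub(excluded)
    }
}

impl Drop for Pause<'_> {
    fn drop(&mut self) {
        let dur = self.started.elapsed();
        if self.is_retry {
            self.timer.accumulated.retry += dur;
        } else {
            self.timer.accumulated.compute += dur;
        }
    }
}

fn main() {
    let mut timer = LatencyTimer::new();
    {
        let _pause = timer.pause(false);
        std::thread::sleep(Duration::from_millis(50)); // stands in for the compute connection
    }
    println!(
        "latency excluding compute: {:?}",
        timer.excluding_compute_and_retry()
    );
}
```

The real timer additionally tracks client and cplane waits and records one histogram sample per exclusion combination, as the diff below shows.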
--- proxy/src/compute.rs | 4 ++++ proxy/src/metrics.rs | 33 ++++++++++++++++++++++++++++++ proxy/src/proxy/connect_compute.rs | 4 ++++ proxy/src/proxy/wake_compute.rs | 4 ++++ proxy/src/serverless/backend.rs | 2 ++ 5 files changed, 47 insertions(+) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 149a619316..44d85c2952 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -260,7 +260,9 @@ impl ConnCfg { aux: MetricsAuxInfo, timeout: Duration, ) -> Result { + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + drop(pause); let tls_connector = native_tls::TlsConnector::builder() .danger_accept_invalid_certs(allow_self_signed_compute) @@ -270,7 +272,9 @@ impl ConnCfg { let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; // connect_raw() will not use TLS if sslmode is "disable" + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; + drop(pause); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 530350008c..c129ece059 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -284,6 +284,8 @@ pub struct ComputeConnectionLatencyGroup { pub enum LatencyExclusions { Client, ClientAndCplane, + ClientCplaneCompute, + ClientCplaneComputeRetry, } #[derive(FixedCardinalityLabel, Copy, Clone)] @@ -352,6 +354,7 @@ pub enum Waiting { Cplane, Client, Compute, + RetryTimeout, } #[derive(Default)] @@ -359,6 +362,7 @@ struct Accumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, + retry: time::Duration, } pub struct LatencyTimer { @@ -421,6 +425,7 @@ impl Drop for LatencyTimerPause<'_> { Waiting::Cplane => self.timer.accumulated.cplane += dur, Waiting::Client => self.timer.accumulated.client += dur, Waiting::Compute => self.timer.accumulated.compute += dur, + Waiting::RetryTimeout => self.timer.accumulated.retry += dur, } } } @@ -464,6 +469,34 @@ impl Drop for LatencyTimer { }, duration.saturating_sub(accumulated_total).as_secs_f64(), ); + + // Exclude client cplane, compue communication from the accumulated time. + let accumulated_total = + self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneCompute, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue, retry communication from the accumulated time. 
+ let accumulated_total = self.accumulated.client + + self.accumulated.cplane + + self.accumulated.compute + + self.accumulated.retry; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneComputeRetry, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8a220aaa0c..f561085588 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -194,6 +194,10 @@ where let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; + drop(pause); } } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index cfedf0e98a..cb9a293413 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -54,7 +54,11 @@ pub async fn wake_compute( let wait_duration = retry_after(*num_retries, config); *num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; + drop(pause); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index b91c0e62ed..c89ebc3251 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -179,7 +179,9 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + drop(pause); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( From 2226acef7ca147276dab2bc3eea94958fbc03036 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Apr 2024 13:16:00 +0100 Subject: [PATCH 100/157] s3_scrubber: add `tenant-snapshot` (#7444) ## Problem Downloading tenant data for analysis/debug with `aws s3 cp` works well for small tenants, but for larger tenants it is unlikely that one ends up with an index that matches layer files, due to the time taken to download. ## Summary of changes - Add a `tenant-snapshot` command to the scrubber, which reads timeline indices and then downloads the layers referenced in the index, even if they were deleted. The result is a snapshot of the tenant's remote storage state that should be usable when imported (#7399 ). 
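For reference, a hypothetical invocation using the flags added below (bucket and region selection follow the scrubber's existing configuration, which is unchanged here):

```bash
# Snapshot one tenant's remote storage into ./snapshot, downloading 8 layers in parallel.
s3_scrubber tenant-snapshot --tenant-id <TENANT_ID> --output-path ./snapshot -j 8
```

The output keeps the remote layout (`<tenant_shard_id>/timelines/<timeline_id>/...`, with generation-suffixed layer files and each `index_part.json` written only after the layers it references), so the snapshot can later be uploaded back into a bucket for import.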
--- Cargo.lock | 1 + s3_scrubber/Cargo.toml | 1 + s3_scrubber/src/lib.rs | 71 +++++- s3_scrubber/src/main.rs | 21 ++ s3_scrubber/src/metadata_stream.rs | 58 ++++- s3_scrubber/src/tenant_snapshot.rs | 293 ++++++++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 31 ++- test_runner/fixtures/remote_storage.py | 8 +- test_runner/fixtures/types.py | 6 +- test_runner/regress/test_s3_scrubber.py | 111 +++++++++ 10 files changed, 586 insertions(+), 15 deletions(-) create mode 100644 s3_scrubber/src/tenant_snapshot.rs create mode 100644 test_runner/regress/test_s3_scrubber.py diff --git a/Cargo.lock b/Cargo.lock index 85a59ec0ed..a130988409 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5085,6 +5085,7 @@ dependencies = [ "aws-smithy-async", "bincode", "bytes", + "camino", "chrono", "clap", "crc32c", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 4d136472e0..0ee9112010 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -25,6 +25,7 @@ async-stream.workspace = true tokio-stream.workspace = true futures-util.workspace = true itertools.workspace = true +camino.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2842877d0..90d58a3bc2 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -5,6 +5,7 @@ pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; pub mod scan_metadata; +pub mod tenant_snapshot; use std::env; use std::fmt::Display; @@ -23,17 +24,18 @@ use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; use aws_smithy_async::rt::sleep::TokioSleep; +use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::io::IsTerminal; use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::TimelineId; +use utils::fs_ext; +use utils::id::{TenantId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -147,6 +149,23 @@ impl RootTarget { self.tenants_root().with_sub_segment(&tenant_id.to_string()) } + pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { + // Only pageserver remote storage contains tenant-shards + assert!(matches!(self, Self::Pageserver(_))); + let Self::Pageserver(root) = self else { + panic!(); + }; + + S3Target { + bucket_name: root.bucket_name.clone(), + prefix_in_bucket: format!( + "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}", + root.prefix_in_bucket + ), + delimiter: root.delimiter.clone(), + } + } + pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { match self { Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), @@ -240,7 +259,6 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { .with_ansi(false) .with_writer(file_writer); let stderr_logs = fmt::Layer::new() - .with_ansi(std::io::stderr().is_terminal()) .with_target(false) .with_writer(std::io::stderr); tracing_subscriber::registry() @@ -396,3 +414,50 @@ async fn download_object_with_retries( anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") } + +async fn download_object_to_file( + s3_client: &Client, + bucket_name: 
&str, + key: &str, + version_id: Option<&str>, + local_path: &Utf8Path, +) -> anyhow::Result<()> { + let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp")); + for _ in 0..MAX_RETRIES { + tokio::fs::remove_file(&tmp_path) + .await + .or_else(fs_ext::ignore_not_found)?; + + let mut file = tokio::fs::File::create(&tmp_path) + .await + .context("Opening output file")?; + + let request = s3_client.get_object().bucket(bucket_name).key(key); + + let request = match version_id { + Some(version_id) => request.version_id(version_id), + None => request, + }; + + let response_stream = match request.send().await { + Ok(response) => response, + Err(e) => { + error!( + "Failed to download object for key {key} version {}: {e:#}", + version_id.unwrap_or("") + ); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut read_stream = response_stream.body.into_async_read(); + + tokio::io::copy(&mut read_stream, &mut file).await?; + + tokio::fs::rename(&tmp_path, local_path).await?; + return Ok(()); + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 957213856b..88ba9bfa61 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,9 +1,12 @@ +use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use s3_scrubber::scan_metadata::scan_metadata; +use s3_scrubber::tenant_snapshot::SnapshotDownloader; use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; use clap::{Parser, Subcommand}; +use utils::id::TenantId; #[derive(Parser)] #[command(author, version, about, long_about = None)] @@ -38,6 +41,14 @@ enum Command { #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, }, + TenantSnapshot { + #[arg(long = "tenant-id")] + tenant_id: TenantId, + #[arg(long = "concurrency", short = 'j', default_value_t = 8)] + concurrency: usize, + #[arg(short, long)] + output_path: Utf8PathBuf, + }, } #[tokio::main] @@ -50,6 +61,7 @@ async fn main() -> anyhow::Result<()> { Command::ScanMetadata { .. } => "scan", Command::FindGarbage { .. } => "find-garbage", Command::PurgeGarbage { .. } => "purge-garbage", + Command::TenantSnapshot { .. 
} => "tenant-snapshot", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -102,5 +114,14 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { input_path, mode } => { purge_garbage(input_path, mode, !cli.delete).await } + Command::TenantSnapshot { + tenant_id, + output_path, + concurrency, + } => { + let downloader = + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + downloader.download().await + } } } diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 073f37f319..b192e0be2e 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -5,7 +5,7 @@ use tokio_stream::Stream; use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; use pageserver_api::shard::TenantShardId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( @@ -45,6 +45,62 @@ pub fn stream_tenants<'a>( } } +pub async fn stream_tenant_shards<'a>( + s3_client: &'a Client, + target: &'a RootTarget, + tenant_id: TenantId, +) -> anyhow::Result> + 'a> { + let mut tenant_shard_ids: Vec> = Vec::new(); + let mut continuation_token = None; + let shards_target = target.tenant_shards_prefix(&tenant_id); + + loop { + tracing::info!("Listing in {}", shards_target.prefix_in_bucket); + let fetch_response = + list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await; + let fetch_response = match fetch_response { + Err(e) => { + tenant_shard_ids.push(Err(e)); + break; + } + Ok(r) => r, + }; + + let new_entry_ids = fetch_response + .common_prefixes() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&target.tenants_root().prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); + + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }); + + for i in new_entry_ids { + tenant_shard_ids.push(i); + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(stream! { + for i in tenant_shard_ids { + let id = i?; + yield Ok(id); + } + }) +} + /// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. 
diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs new file mode 100644 index 0000000000..4eccad381b --- /dev/null +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -0,0 +1,293 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; +use anyhow::Context; +use async_stream::stream; +use aws_sdk_s3::Client; +use camino::Utf8PathBuf; +use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use utils::generation::Generation; +use utils::id::TenantId; + +pub struct SnapshotDownloader { + s3_client: Arc, + s3_root: RootTarget, + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, +} + +impl SnapshotDownloader { + pub fn new( + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, + ) -> anyhow::Result { + let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + Ok(Self { + s3_client, + s3_root, + bucket_config, + tenant_id, + output_path, + concurrency, + }) + } + + async fn download_layer( + &self, + ttid: TenantShardTimelineId, + layer_name: LayerFileName, + layer_metadata: IndexLayerMetadata, + ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> { + // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use + // different layer names (remote-style has the generation suffix) + let local_path = self.output_path.join(format!( + "{}/timelines/{}/{}{}", + ttid.tenant_shard_id, + ttid.timeline_id, + layer_name.file_name(), + layer_metadata.generation.get_suffix() + )); + + // We should only be called for layers that are owned by the input TTID + assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index()); + + // Assumption: we always write layer files atomically, and layer files are immutable. Therefore if the file + // already exists on local disk, we assume it is fully correct and skip it. + if tokio::fs::try_exists(&local_path).await? { + tracing::debug!("{} already exists", local_path); + return Ok((layer_name, layer_metadata)); + } else { + tracing::debug!("{} requires download...", local_path); + + let timeline_root = self.s3_root.timeline_root(&ttid); + let remote_layer_path = format!( + "{}{}{}", + timeline_root.prefix_in_bucket, + layer_name.file_name(), + layer_metadata.generation.get_suffix() + ); + + // List versions: the object might be deleted. 
+ let versions = self + .s3_client + .list_object_versions() + .bucket(self.bucket_config.bucket.clone()) + .prefix(&remote_layer_path) + .send() + .await?; + let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else { + return Err(anyhow::anyhow!("No versions found for {remote_layer_path}")); + }; + download_object_to_file( + &self.s3_client, + &self.bucket_config.bucket, + &remote_layer_path, + version.version_id.as_deref(), + &local_path, + ) + .await?; + + tracing::debug!("Downloaded successfully to {local_path}"); + } + + Ok((layer_name, layer_metadata)) + } + + /// Download many layers belonging to the same TTID, with some concurrency + async fn download_layers( + &self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerFileName, IndexLayerMetadata)>, + ) -> anyhow::Result<()> { + let layer_count = layers.len(); + tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); + let layers_stream = stream! { + for (layer_name, layer_metadata) in layers { + yield self.download_layer(ttid, layer_name, layer_metadata); + } + }; + + tokio::fs::create_dir_all(self.output_path.join(format!( + "{}/timelines/{}", + ttid.tenant_shard_id, ttid.timeline_id + ))) + .await?; + + let layer_results = layers_stream.buffered(self.concurrency); + let mut layer_results = std::pin::pin!(layer_results); + + let mut err = None; + let mut download_count = 0; + while let Some(i) = layer_results.next().await { + download_count += 1; + match i { + Ok((layer_name, layer_metadata)) => { + tracing::info!( + "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", + layer_metadata.file_size, + layer_name.file_name() + ); + } + Err(e) => { + // Warn and continue: we will download what we can + tracing::warn!("Download error: {e}"); + err = Some(e); + } + } + } + if let Some(e) = err { + tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}"); + Err(e) + } else { + Ok(()) + } + } + + async fn download_timeline( + &self, + ttid: TenantShardTimelineId, + index_part: IndexPart, + index_part_generation: Generation, + ancestor_layers: &mut HashMap< + TenantShardTimelineId, + HashMap, + >, + ) -> anyhow::Result<()> { + let index_bytes = serde_json::to_string(&index_part).unwrap(); + + let layers = index_part + .layer_metadata + .into_iter() + .filter_map(|(layer_name, layer_metadata)| { + if layer_metadata.shard.shard_count != ttid.tenant_shard_id.shard_count { + // Accumulate ancestor layers for later download + let ancestor_ttid = TenantShardTimelineId::new( + TenantShardId { + tenant_id: ttid.tenant_shard_id.tenant_id, + shard_number: layer_metadata.shard.shard_number, + shard_count: layer_metadata.shard.shard_count, + }, + ttid.timeline_id, + ); + let ancestor_ttid_layers = ancestor_layers.entry(ancestor_ttid).or_default(); + use std::collections::hash_map::Entry; + match ancestor_ttid_layers.entry(layer_name) { + Entry::Occupied(entry) => { + // Descendent shards that reference a layer from an ancestor should always have matching metadata, + // as their siblings, because it is read atomically during a shard split. 
+ assert_eq!(entry.get(), &layer_metadata); + } + Entry::Vacant(entry) => { + entry.insert(layer_metadata); + } + } + None + } else { + Some((layer_name, layer_metadata)) + } + }) + .collect(); + + let download_result = self.download_layers(ttid, layers).await; + + // Write index last, once all the layers it references are downloaded + let local_index_path = self.output_path.join(format!( + "{}/timelines/{}/index_part.json{}", + ttid.tenant_shard_id, + ttid.timeline_id, + index_part_generation.get_suffix() + )); + tokio::fs::write(&local_index_path, index_bytes) + .await + .context("writing index")?; + + download_result + } + + pub async fn download(&self) -> anyhow::Result<()> { + let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + + // Generate a stream of TenantShardId + let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; + let shards: Vec = shards.try_collect().await?; + + // Only read from shards that have the highest count: avoids redundantly downloading + // from ancestor shards. + let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else { + anyhow::bail!("No shards found"); + }; + + // We will build a collection of layers in anccestor shards to download (this will only + // happen if this tenant has been split at some point) + let mut ancestor_layers: HashMap< + TenantShardTimelineId, + HashMap, + > = Default::default(); + + for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { + // Generate a stream of TenantTimelineId + let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?; + + // Generate a stream of S3TimelineBlobData + async fn load_timeline_index( + s3_client: &Client, + target: &RootTarget, + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { + let data = list_timeline_blobs(s3_client, ttid, target).await?; + Ok((ttid, data)) + } + let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(8)); + + while let Some(i) = timelines.next().await { + let (ttid, data) = i?; + match data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _, + } => { + self.download_timeline( + ttid, + index_part, + index_part_generation, + &mut ancestor_layers, + ) + .await + .context("Downloading timeline")?; + } + BlobDataParseResult::Relic => {} + BlobDataParseResult::Incorrect(_) => { + tracing::error!("Bad metadata in timeline {ttid}"); + } + }; + } + } + + for (ttid, layers) in ancestor_layers.into_iter() { + tracing::info!( + "Downloading {} layers from ancvestor timeline {ttid}...", + layers.len() + ); + + self.download_layers(ttid, layers.into_iter().collect()) + .await?; + } + + Ok(()) + } +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a94732a682..07db355d98 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2310,20 +2310,24 @@ class NeonPageserver(PgProtocol): # The entries in the list are regular experessions. 
self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) - def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path: + def timeline_dir( + self, + tenant_shard_id: Union[TenantId, TenantShardId], + timeline_id: Optional[TimelineId] = None, + ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: - return self.tenant_dir(tenant_id) / "timelines" - return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_shard_id) / "timelines" + return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id) def tenant_dir( self, - tenant_id: Optional[TenantId] = None, + tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" - if tenant_id is None: + if tenant_shard_id is None: return self.workdir / "tenants" - return self.workdir / "tenants" / str(tenant_id) + return self.workdir / "tenants" / str(tenant_shard_id) def start( self, @@ -2510,8 +2514,10 @@ class NeonPageserver(PgProtocol): client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) - def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: - path = self.tenant_dir(tenant_id) / "config-v1" + def read_tenant_location_conf( + self, tenant_shard_id: Union[TenantId, TenantShardId] + ) -> dict[str, Any]: + path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") bytes = open(path, "r").read() try: @@ -3715,7 +3721,7 @@ class S3Scrubber: log.warning(f"Scrub environment: {env}") log.warning(f"Output at: {output_path}") - raise RuntimeError("Remote storage scrub failed") + raise RuntimeError(f"Scrubber failed while running {args}") assert stdout is not None return stdout @@ -3730,6 +3736,13 @@ class S3Scrubber: log.error(stdout) raise + def tenant_snapshot(self, tenant_id: TenantId, output_path: Path): + stdout = self.scrubber_cli( + ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)], + timeout=30, + ) + log.info(f"tenant-snapshot output: {stdout}") + def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: """Compute the path to a working directory for an individual test.""" diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 60591d8d46..83f9f26837 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -252,8 +252,11 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenants_path(self) -> str: + return f"{self.prefix_in_bucket}/tenants" + def tenant_path(self, tenant_id: TenantId) -> str: - return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + return f"{self.tenants_path()}/{tenant_id}" def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" @@ -262,6 +265,9 @@ class S3Storage: r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) return json.loads(r["Body"].read().decode("utf-8")) + def mock_remote_tenant_path(self, tenant_id: TenantId): + assert self.real is False + RemoteStorage = Union[LocalFsStorage, S3Storage] diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index 80c9b9ce9a..b5458b5c26 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ 
-156,7 +156,11 @@ class TenantShardId: raise ValueError(f"Invalid TenantShardId '{input}'") def __str__(self): - return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + if self.shard_count > 0: + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + else: + # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) + return str(self.tenant_id) def __repr__(self): return self.__str__() diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py new file mode 100644 index 0000000000..018c1637d0 --- /dev/null +++ b/test_runner/regress/test_s3_scrubber.py @@ -0,0 +1,111 @@ +import os +import shutil +from typing import Optional + +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + S3Scrubber, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.types import TenantShardId +from fixtures.workload import Workload + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + """ + Test the `tenant-snapshot` subcommand, which grabs data from remote storage + + This is only a support/debug tool, but worth testing to ensure the tool does not regress. + """ + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + branch = "main" + + # Do some work + workload = Workload(env, tenant_id, timeline_id, branch) + workload.init() + + # Multiple write/flush passes to generate multiple layers + for _n in range(0, 3): + workload.write_rows(128) + + # Do some more work after a restart, so that we have multiple generations + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _n in range(0, 3): + workload.write_rows(128) + + # If we're doing multiple shards, split: this is important to exercise + # the scrubber's ability to understand the references from child shards to parent shard's layers + if shard_count is not None: + tenant_shard_ids = env.storage_controller.tenant_shard_split( + tenant_id, shard_count=shard_count + ) + + # Write after shard split: this will result in shards containing a mixture of owned + # and parent layers in their index. 
+ workload.write_rows(128) + else: + tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + + output_path = neon_env_builder.test_output_dir / "snapshot" + os.makedirs(output_path) + + scrubber = S3Scrubber(neon_env_builder) + scrubber.tenant_snapshot(tenant_id, output_path) + + assert len(os.listdir(output_path)) > 0 + + workload.stop() + + # Stop pageservers + for pageserver in env.pageservers: + pageserver.stop() + + # Drop all shards' local storage + for tenant_shard_id in tenant_shard_ids: + pageserver = env.get_tenant_pageserver(tenant_shard_id) + shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id)) + + # Replace remote storage contents with the snapshot we downloaded + assert isinstance(env.pageserver_remote_storage, S3Storage) + + remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id) + + # Delete current remote storage contents + bucket = env.pageserver_remote_storage.bucket_name + remote_client = env.pageserver_remote_storage.client + deleted = 0 + for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[ + "Contents" + ]: + key = object["Key"] + remote_client.delete_object(Key=key, Bucket=bucket) + deleted += 1 + assert deleted > 0 + + # Upload from snapshot + for root, _dirs, files in os.walk(output_path): + for file in files: + full_local_path = os.path.join(root, file) + full_remote_path = ( + env.pageserver_remote_storage.tenants_path() + + "/" + + full_local_path.removeprefix(f"{output_path}/") + ) + remote_client.upload_file(full_local_path, bucket, full_remote_path) + + for pageserver in env.pageservers: + pageserver.start() + + # Check we can read everything + workload.validate() From 90cadfa986327d6ae29bfef32a6a60d67f19c845 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:26:21 +0200 Subject: [PATCH 101/157] proxy: Adjust retry wake compute (#7537) ## Problem Right now we always do retry wake compute. ## Summary of changes Create a list of errors when we could avoid needless retries. --- proxy/src/proxy/connect_compute.rs | 9 +++++++- proxy/src/proxy/retry.rs | 34 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index f561085588..da6223209f 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -133,10 +133,17 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = if !node_info.cached() { + let node_info = if !node_info.cached() || !err.should_retry_database_address() { // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. 
if !err.should_retry(num_retries, connect_to_compute_retry_config) { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(err.into()); } node_info diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 082e06caa3..36a05ba190 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -10,6 +10,9 @@ pub trait ShouldRetry { err => err.could_retry(), } } + fn should_retry_database_address(&self) -> bool { + true + } } impl ShouldRetry for io::Error { @@ -33,6 +36,21 @@ impl ShouldRetry for tokio_postgres::error::DbError { | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, ) } + fn should_retry_database_address(&self) -> bool { + use tokio_postgres::error::SqlState; + // Here are errors that happens after the user successfully authenticated to the database. + // TODO: there are pgbouncer errors that should be retried, but they are not listed here. + !matches!( + self.code(), + &SqlState::TOO_MANY_CONNECTIONS + | &SqlState::OUT_OF_MEMORY + | &SqlState::SYNTAX_ERROR + | &SqlState::T_R_SERIALIZATION_FAILURE + | &SqlState::INVALID_CATALOG_NAME + | &SqlState::INVALID_SCHEMA_NAME + | &SqlState::INVALID_PARAMETER_VALUE + ) + } } impl ShouldRetry for tokio_postgres::Error { @@ -45,6 +63,15 @@ impl ShouldRetry for tokio_postgres::Error { false } } + fn should_retry_database_address(&self) -> bool { + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::should_retry_database_address(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_database_address(db_err) + } else { + true + } + } } impl ShouldRetry for compute::ConnectionError { @@ -55,6 +82,13 @@ impl ShouldRetry for compute::ConnectionError { _ => false, } } + fn should_retry_database_address(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), + compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + _ => true, + } + } } pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { From 1684bbf16255a5cffd06ca03d9abe1124745b964 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 29 Apr 2024 15:22:13 +0200 Subject: [PATCH 102/157] proxy: Create disconnect events (#7535) ## Problem It's not possible to get the duration of the session from proxy events. ## Summary of changes * Added a separate events folder in s3, to record disconnect events. * Disconnect events are exactly the same as normal events, but also have `disconnect_timestamp` field not empty. * @oruen suggested to fill it with the same information as the original events to avoid potentially heavy joins. --- proxy/src/bin/pg_sni_router.rs | 2 +- proxy/src/context.rs | 33 ++++++++-- proxy/src/context/parquet.rs | 104 +++++++++++++++++++++--------- proxy/src/proxy.rs | 4 +- proxy/src/serverless/websocket.rs | 4 +- 5 files changed, 102 insertions(+), 45 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 7a693002a8..fb16b76567 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -279,7 +279,7 @@ async fn handle_client( // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); - ctx.log(); + ctx.log_connect(); // Starting from here we only proxy the client's traffic. 
info!("performing the proxy pass..."); diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 17b82c08aa..dfd3ef108e 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -20,7 +20,8 @@ use self::parquet::RequestData; pub mod parquet; -static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new(); /// Context data for a single request to connect to a database. /// @@ -49,9 +50,12 @@ pub struct RequestMonitoring { // extra // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, + // This sender is only used to log the length of session in case of success. + disconnect_sender: Option>, pub latency_timer: LatencyTimer, // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. rejected: Option, + disconnect_timestamp: Option>, } #[derive(Clone, Debug)] @@ -100,7 +104,9 @@ impl RequestMonitoring { cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), + disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), + disconnect_timestamp: None, } } @@ -190,11 +196,7 @@ impl RequestMonitoring { self.success = true; } - pub fn log(self) {} -} - -impl Drop for RequestMonitoring { - fn drop(&mut self) { + pub fn log_connect(&mut self) { let outcome = if self.success { ConnectOutcome::Success } else { @@ -226,4 +228,23 @@ impl Drop for RequestMonitoring { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } } + + fn log_disconnect(&mut self) { + // If we are here, it's guaranteed that the user successfully connected to the endpoint. + // Here we log the length of the session. + self.disconnect_timestamp = Some(Utc::now()); + if let Some(tx) = self.disconnect_sender.take() { + let _: Result<(), _> = tx.send(RequestData::from(&*self)); + } + } +} + +impl Drop for RequestMonitoring { + fn drop(&mut self) { + if self.sender.is_some() { + self.log_connect(); + } else { + self.log_disconnect(); + } + } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9600321937..8104fe6087 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -19,7 +19,10 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig}; +use crate::{ + config::{remote_storage_from_toml, OptRemoteStorageConfig}, + context::LOG_CHAN_DISCONNECT, +}; use super::{RequestMonitoring, LOG_CHAN}; @@ -31,6 +34,9 @@ pub struct ParquetUploadArgs { #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] parquet_upload_row_group_size: usize, @@ -91,6 +97,8 @@ pub struct RequestData { /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, + /// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`. 
+ disconnect_timestamp: Option, } impl From<&RequestMonitoring> for RequestData { @@ -120,6 +128,7 @@ impl From<&RequestMonitoring> for RequestData { .elapsed() .unwrap_or_default() .as_micros() as u64, // 584 millenia... good enough + disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()), } } } @@ -141,8 +150,9 @@ pub async fn worker( LOG_CHAN.set(tx.downgrade()).unwrap(); // setup row stream that will close on cancellation + let cancellation_token2 = cancellation_token.clone(); tokio::spawn(async move { - cancellation_token.cancelled().await; + cancellation_token2.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed. drop(tx); @@ -167,9 +177,38 @@ pub async fn worker( test_remote_failures: 0, }; - worker_inner(storage, rx, parquet_config).await + // TODO(anna): consider moving this to a separate function. + if let Some(disconnect_events_storage_config) = + config.parquet_upload_disconnect_events_remote_storage + { + let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); + LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + + // setup row stream that will close on cancellation + tokio::spawn(async move { + cancellation_token.cancelled().await; + // dropping this sender will cause the channel to close only once + // all the remaining inflight requests have been completed. + drop(tx_disconnect); + }); + let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); + let rx_disconnect = rx_disconnect.map(RequestData::from); + + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .context("remote storage for disconnect events init")?; + let parquet_config_disconnect = parquet_config.clone(); + tokio::try_join!( + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + ) + .map(|_| ()) + } else { + worker_inner(storage, rx, parquet_config).await + } } +#[derive(Clone, Debug)] struct ParquetConfig { propeties: WriterPropertiesPtr, rows_per_group: usize, @@ -452,6 +491,7 @@ mod tests { success: rng.gen(), cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), + disconnect_timestamp: None, } } @@ -520,15 +560,15 @@ mod tests { assert_eq!( file_stats, [ - (1314385, 3, 6000), - (1314378, 3, 6000), - (1314438, 3, 6000), - (1314395, 3, 6000), - (1314525, 3, 6000), - (1314367, 3, 6000), - (1314159, 3, 6000), - (1314395, 3, 6000), - (438352, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -558,11 +598,11 @@ mod tests { assert_eq!( file_stats, [ - (1220633, 5, 10000), - (1226783, 5, 10000), - (1228577, 5, 10000), - (1227939, 5, 10000), - (1219217, 5, 10000) + (1221738, 5, 10000), + (1227888, 5, 10000), + (1229682, 5, 10000), + (1229044, 5, 10000), + (1220322, 5, 10000) ] ); @@ -594,11 +634,11 @@ mod tests { assert_eq!( file_stats, [ - (1206280, 5, 10000), - (1206011, 5, 10000), - (1206304, 5, 10000), - (1206292, 5, 10000), - (1206547, 5, 10000) + (1207385, 5, 10000), + (1207116, 5, 10000), + (1207409, 5, 10000), + (1207397, 5, 10000), + (1207652, 5, 10000) ] ); @@ -623,15 +663,15 @@ mod tests { assert_eq!( file_stats, [ - (1314385, 3, 6000), - (1314378, 3, 6000), - (1314438, 3, 6000), - (1314395, 3, 6000), - (1314525, 3, 6000), - (1314367, 3, 6000), - (1314159, 3, 
6000), - (1314395, 3, 6000), - (438352, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -668,7 +708,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)] + [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ddae6536fb..33d73eb675 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -132,16 +132,14 @@ pub async fn task_main( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); error!(parent: &span, "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); - ctx.log(); } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + ctx.log_connect(); match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} Err(e) => { diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index eddd278b7d..b6cd85af73 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -156,17 +156,15 @@ pub async fn serve_websocket( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); Err(e.into()) } Ok(None) => { ctx.set_success(); - ctx.log(); Ok(()) } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + ctx.log_connect(); p.proxy_pass().await } } From 1f417af9fd7e43de192dcd536d1ff0bab5b85f80 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 29 Apr 2024 17:26:35 +0100 Subject: [PATCH 103/157] pagserver: use vectored read path in benchmarks (#7498) ## Problem Benchmarks don't use the vectored read path. ## Summary of changes * Update the benchmarks to use the vectored read path for both singular and vectored gets. 
* Disable validation for the benchmarks --- .github/workflows/build_and_test.yml | 4 ++++ control_plane/src/local_env.rs | 2 ++ control_plane/src/pageserver.rs | 7 +++++++ test_runner/fixtures/neon_fixtures.py | 7 +++++++ test_runner/regress/test_compatibility.py | 3 ++- 5 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 65b573663a..606564f209 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -478,6 +478,7 @@ jobs: PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_IMPL: vectored PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: true # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 @@ -557,6 +558,9 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2168d4b944..8cbda528a7 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -130,6 +130,7 @@ pub struct PageServerConf { pub(crate) virtual_file_io_engine: Option, pub(crate) get_vectored_impl: Option, pub(crate) get_impl: Option, + pub(crate) validate_vectored_get: Option, } impl Default for PageServerConf { @@ -143,6 +144,7 @@ impl Default for PageServerConf { virtual_file_io_engine: None, get_vectored_impl: None, get_impl: None, + validate_vectored_get: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 0699e47866..52accc5890 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -93,6 +93,7 @@ impl PageServerNode { virtual_file_io_engine, get_vectored_impl, get_impl, + validate_vectored_get, } = &self.conf; let id = format!("id={}", id); @@ -117,6 +118,11 @@ impl PageServerNode { } else { String::new() }; + let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get { + format!("validate_vectored_get={validate_vectored_get}") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -131,6 +137,7 @@ impl PageServerNode { virtual_file_io_engine, get_vectored_impl, get_impl, + validate_vectored_get, ]; if let Some(control_plane_api) = &self.env.control_plane_api { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 07db355d98..abe2718a49 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -512,6 +512,11 @@ class NeonEnvBuilder: self.pageserver_get_impl = "vectored" log.debug('Overriding pageserver get_impl config to "vectored"') + self.pageserver_validate_vectored_get: Optional[bool] = None + if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: + self.pageserver_validate_vectored_get = bool(validate) + log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1085,6 +1090,8 @@ class NeonEnv: 
ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl if config.pageserver_get_impl is not None: ps_cfg["get_impl"] = config.pageserver_get_impl + if config.pageserver_validate_vectored_get is not None: + ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get # Create a corresponding NeonPageserver object self.pageservers.append( diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 2a371eae72..e1ccb3e0c6 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -228,8 +228,9 @@ def test_forward_compatibility( try: # Previous version neon_local and pageserver are not aware # of the new config. - # TODO: remove this once the code reaches main + # TODO: remove these once the previous version of neon local supports them neon_env_builder.pageserver_get_impl = None + neon_env_builder.pageserver_validate_vectored_get = None neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath From 89cae64e38a68045b1f748d5b15d5cd607c9958a Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 29 Apr 2024 12:33:01 -0400 Subject: [PATCH 104/157] chore(vm-image): specify sql exporter listen port (#7526) Extracted from https://github.com/neondatabase/neon/pull/7514, 9399 is the default port. We want to specify it b/c we will start a second sql exporter for autoscaling agent soon. Signed-off-by: Alex Chi Z --- vm-image-spec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index c760744491..061ff38722 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -16,7 +16,7 @@ commands: - name: sql-exporter user: nobody sysvInitAction: respawn - shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: From af7cca494930bad73ddd3f8eb21289000ddeb3ac Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 29 Apr 2024 17:35:08 +0100 Subject: [PATCH 105/157] pageserver: tweak vec get validation for ancestor lsn wait (#7533) ## Problem Sequential get runs after vectored get, so it is possible for the later to time out while waiting for its ancestor's Lsn to become ready and for the former to succeed (it essentially has a doubled wait time). ## Summary of Changes Relax the validation to allow for such rare cases. --- pageserver/src/tenant/timeline.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c10adf4c22..108acd3925 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1149,6 +1149,11 @@ impl Timeline { panic!(concat!("Sequential get failed with {}, but vectored get did not", " - keyspace={:?} lsn={}"), seq_err, keyspace, lsn) }, + (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { + // Sequential get runs after vectored get, so it is possible for the later + // to time out while waiting for its ancestor's Lsn to become ready and for the + // former to succeed (it essentially has a doubled wait time). 
+ }, (Ok(_), Err(vec_err)) => { panic!(concat!("Vectored get failed with {}, but sequential get did not", " - keyspace={:?} lsn={}"), From cddafc79e1d528e35cd9d2b5308aea2138790af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 29 Apr 2024 19:02:53 +0200 Subject: [PATCH 106/157] Update azure_* crates to 0.19 (#7539) Updates the four azure SDK crates used by remote_storage to 0.19. --- Cargo.lock | 65 +++++++++++++++++++++++++++--------------------------- Cargo.toml | 8 +++---- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a130988409..de548bb2de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -722,9 +722,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" +checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7" dependencies = [ "async-trait", "base64 0.21.1", @@ -752,9 +752,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" +checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f" dependencies = [ "async-lock", "async-trait", @@ -772,9 +772,9 @@ dependencies = [ [[package]] name = "azure_storage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" +checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266" dependencies = [ "RustyXML", "async-lock", @@ -791,9 +791,9 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" +checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94" dependencies = [ "RustyXML", "azure_core", @@ -812,9 +812,9 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b" dependencies = [ "azure_core", "bytes", @@ -2763,9 +2763,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -6413,11 +6413,10 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -6437,9 +6436,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.24" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", @@ -6448,9 +6447,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", "valuable", @@ -6905,9 +6904,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -6915,9 +6914,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -6930,9 +6929,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ "cfg-if", "js-sys", @@ -6942,9 +6941,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6952,9 +6951,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -6965,9 +6964,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "wasm-streams" @@ -6999,9 +6998,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 677eaa9ce4..92dcc254d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,10 +45,10 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", 
features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -azure_core = "0.18" -azure_identity = "0.18" -azure_storage = "0.18" -azure_storage_blobs = "0.18" +azure_core = "0.19" +azure_identity = "0.19" +azure_storage = "0.19" +azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" From 11945e64ecec437caf5840edfa7a31ac765ce5e1 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 29 Apr 2024 13:16:42 -0400 Subject: [PATCH 107/157] chore(pageserver): improve in-memory layer vectored get (#7467) previously in https://github.com/neondatabase/neon/pull/7375, we observed that for in-memory layers, we will need to iterate every key in the key space in order to get the result. The operation can be more efficient if we use BTreeMap as the in-memory layer representation, even if we are doing vectored get in a dense keyspace. Imagine a case that the in-memory layer covers a very little part of the keyspace, and most of the keys need to be found in lower layers. Using a BTreeMap can significantly reduce probes for nonexistent keys. ## Summary of changes * Use BTreeMap as in-memory layer representation. * Optimize the vectored get flow to utilize the range scan functionality of BTreeMap. Signed-off-by: Alex Chi Z --- .../tenant/storage_layer/inmemory_layer.rs | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 8ec4d61434..5fb5d231c7 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -17,7 +17,7 @@ use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BTreeMap, BinaryHeap, HashSet}; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -78,10 +78,10 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed + /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: HashMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. 
@@ -384,25 +384,20 @@ impl InMemoryLayer { let mut planned_block_reads = BinaryHeap::new(); for range in keyspace.ranges.iter() { - let mut key = range.start; - while key < range.end { - if let Some(vec_map) = inner.index.get(&key) { - let lsn_range = match reconstruct_state.get_cached_lsn(&key) { - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; + for (key, vec_map) in inner.index.range(range.start..range.end) { + let lsn_range = match reconstruct_state.get_cached_lsn(key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key, - lsn: *entry_lsn, - block_offset: *pos, - }); - } + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + planned_block_reads.push(BlockRead { + key: *key, + lsn: *entry_lsn, + block_offset: *pos, + }); } - - key = key.next(); } } @@ -499,7 +494,7 @@ impl InMemoryLayer { end_lsn: OnceLock::new(), opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { - index: HashMap::new(), + index: BTreeMap::new(), file, resource_units: GlobalResourceUnits::new(), }), @@ -636,26 +631,17 @@ impl InMemoryLayer { let cursor = inner.file.block_cursor(); - // Sort the keys because delta layer writer expects them sorted. - // - // NOTE: this sort can take up significant time if the layer has millions of - // keys. To speed up all the comparisons we convert the key to i128 and - // keep the value as a reference. - let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect(); - keys.sort_unstable_by_key(|k| k.0); - let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(); - for (key, vec_map) in keys.iter() { - let key = Key::from_i128(*key); + for (key, vec_map) in inner.index.iter() { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(key, *lsn, buf, will_init) + .put_value_bytes(*key, *lsn, buf, will_init) .await; res?; } From 574645412b376fac11125e9960f432ed0c99a44c Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 29 Apr 2024 18:46:46 +0100 Subject: [PATCH 108/157] pageserver: shard-aware keyspace partitioning (#6778) ## Problem Followup to https://github.com/neondatabase/neon/pull/6776 While #6776 makes compaction safe on sharded tenants, the logic for keyspace partitioning remains inefficient: it assumes that the size of data on a pageserver can be calculated simply as the range between start and end of a Range -- this is not the case in sharded tenants, where data within a range belongs to a variety of shards. Closes: https://github.com/neondatabase/neon/issues/6774 ## Summary of changes I experimented with using a sharding-aware range type in KeySpace to replace all the Range uses, but the impact on other code was quite large (many places use the ranges), and not all of them need this property of being able to approximate the physical size of data within a key range. So I compromised on expressing this as a ShardedRange type, but only using that type selectively: during keyspace repartition, and in tiered compaction when accumulating key ranges.
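To make the size-estimation problem concrete, here is a minimal, self-contained sketch (illustrative only, not the patch's API): `local_block_count` is a made-up helper, and it assumes a simple round-robin stripe-to-shard mapping, whereas the real `ShardIdentity` hashes the relation and stripe index. It shows how far the naive `end - start` estimate can be from the number of blocks a single shard actually stores in a contiguous range.

```rust
use std::ops::Range;

/// Hypothetical helper (not the patch's API): count the blocks in `range` that
/// land on `shard`, assuming blocks are striped across shards in fixed-size,
/// round-robin stripes.
fn local_block_count(range: Range<u32>, shard: u32, shard_count: u32, stripe_size: u32) -> u32 {
    range
        .filter(|blk| (*blk / stripe_size) % shard_count == shard)
        .count() as u32
}

fn main() {
    // 4 shards, 32768-block stripes, one contiguous 131072-block range:
    // a single shard physically stores only a quarter of the blocks.
    assert_eq!(local_block_count(0..131_072, 0, 4, 32_768), 32_768);
    // The naive estimate (end - start) would be 131072, i.e. 4x too large,
    // which is the kind of overestimate that led to undersized image layers.
}
```

With that context, the concrete changes are: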
- keyspace partitioning methods take sharding parameters as an input - new `ShardedRange` type wraps a Range and a shard identity - ShardedRange::page_count is the shard-aware replacement for key_range_size - Callers that don't need to be shard-aware (e.g. vectored get code that just wants to count the number of keys in a keyspace) can use ShardedRange::raw_size to get the faster, shard-naive code (same as old `key_range_size`) - Compaction code is updated to carry a shard identity so that it can use shard aware calculations - Unit tests for the new fragmentation logic. - Add a test for compaction on sharded tenants, that validates that we generate appropriately sized image layers (this fails before fixing keyspace partitioning) --- libs/pageserver_api/src/keyspace.rs | 744 ++++++++++++++++-- libs/pageserver_api/src/shard.rs | 2 +- pageserver/compaction/src/compact_tiered.rs | 22 +- pageserver/compaction/src/helpers.rs | 11 +- pageserver/compaction/src/interface.rs | 10 +- pageserver/compaction/src/simulator.rs | 8 +- pageserver/src/basebackup.rs | 5 +- .../tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 12 +- pageserver/src/tenant/timeline/compaction.rs | 6 +- test_runner/regress/test_compaction.py | 101 +++ 11 files changed, 841 insertions(+), 82 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index eed4835f25..4283da18ab 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,7 +1,10 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::key::Key; +use crate::{ + key::Key, + shard::{ShardCount, ShardIdentity}, +}; use itertools::Itertools; /// @@ -14,6 +17,234 @@ pub struct KeySpace { pub ranges: Vec>, } +/// Represents a contiguous half-open range of the keyspace, masked according to a particular +/// ShardNumber's stripes: within this range of keys, only some "belong" to the current +/// shard. +/// +/// When we iterate over keys within this object, we will skip any keys that don't belong +/// to this shard. +/// +/// The start + end keys may not belong to the shard: these specify where layer files should +/// start + end, but we will never actually read/write those keys. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShardedRange<'a> { + pub shard_identity: &'a ShardIdentity, + pub range: Range, +} + +// Calculate the size of a range within the blocks of the same relation, or spanning only the +// top page in the previous relation's space. +fn contiguous_range_len(range: &Range) -> u32 { + debug_assert!(is_contiguous_range(range)); + if range.start.field6 == 0xffffffff { + range.end.field6 + 1 + } else { + range.end.field6 - range.start.field6 + } +} + +/// Return true if this key range includes only keys in the same relation's data blocks, or +/// just spanning one relation and the logical size (0xffffffff) block of the relation before it. +/// +/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not +/// be on our shard. Later in ShardedRange we do the extra work to figure out how much +/// of a given contiguous range is present on one shard. +/// +/// This matters, because: +/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. +/// - Within such ranges, we may calculate distances using simple subtraction of field6. 
+fn is_contiguous_range(range: &Range) -> bool { + range.start.field1 == range.end.field1 + && range.start.field2 == range.end.field2 + && range.start.field3 == range.end.field3 + && range.start.field4 == range.end.field4 + && (range.start.field5 == range.end.field5 + || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5)) +} + +impl<'a> ShardedRange<'a> { + pub fn new(range: Range, shard_identity: &'a ShardIdentity) -> Self { + Self { + shard_identity, + range, + } + } + + /// Break up this range into chunks, each of which has at least one local key in it if the + /// total range has at least one local key. + pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range)> { + // Optimization for single-key case (e.g. logical size keys) + if self.range.end == self.range.start.add(1) { + return vec![( + if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }, + self.range, + )]; + } + + if !is_contiguous_range(&self.range) { + // Ranges that span relations are not fragmented. We only get these ranges as a result + // of operations that act on existing layers, so we trust that the existing range is + // reasonably small. + return vec![(u32::MAX, self.range)]; + } + + let mut fragments: Vec<(u32, Range)> = Vec::new(); + + let mut cursor = self.range.start; + while cursor < self.range.end { + let advance_by = self.distance_to_next_boundary(cursor); + let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor); + + // If the previous fragment is undersized, then we seek to consume enough + // blocks to complete it. + let (want_blocks, merge_last_fragment) = match fragments.last_mut() { + Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)), + Some(frag) => { + // Prev block is complete, want the full number. + ( + target_nblocks, + if is_fragment_disposable { + // If this current range will be empty (not shard-local data), we will merge into previous + Some(frag) + } else { + None + }, + ) + } + None => { + // First iteration, want the full number + (target_nblocks, None) + } + }; + + let advance_by = if is_fragment_disposable { + advance_by + } else { + std::cmp::min(advance_by, want_blocks) + }; + + let next_cursor = cursor.add(advance_by); + + let this_frag = ( + if is_fragment_disposable { + 0 + } else { + advance_by + }, + cursor..next_cursor, + ); + cursor = next_cursor; + + if let Some(last_fragment) = merge_last_fragment { + // Previous fragment was short or this one is empty, merge into it + last_fragment.0 += this_frag.0; + last_fragment.1.end = this_frag.1.end; + } else { + fragments.push(this_frag); + } + } + + fragments + } + + /// Estimate the physical pages that are within this range, on this shard. This returns + /// u32::MAX if the range spans relations: this return value should be interpreted as "large". 
+ pub fn page_count(&self) -> u32 { + // Special cases for single keys like logical sizes + if self.range.end == self.range.start.add(1) { + return if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }; + } + + // We can only do an authentic calculation of contiguous key ranges + if !is_contiguous_range(&self.range) { + return u32::MAX; + } + + // Special case for single sharded tenants: our logical and physical sizes are the same + if self.shard_identity.count < ShardCount::new(2) { + return contiguous_range_len(&self.range); + } + + // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs + // to Self, and add the stripe's block count to our total if so. + let mut result: u64 = 0; + let mut cursor = self.range.start; + while cursor < self.range.end { + // Count up to the next stripe_size boundary or end of range + let advance_by = self.distance_to_next_boundary(cursor); + + // If this blocks in this stripe belong to us, add them to our count + if !self.shard_identity.is_key_disposable(&cursor) { + result += advance_by as u64; + } + + cursor = cursor.add(advance_by); + } + + if result > u32::MAX as u64 { + u32::MAX + } else { + result as u32 + } + } + + /// Advance the cursor to the next potential fragment boundary: this is either + /// a stripe boundary, or the end of the range. + fn distance_to_next_boundary(&self, cursor: Key) -> u32 { + let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end)); + + if self.shard_identity.count < ShardCount::new(2) { + // Optimization: don't bother stepping through stripes if the tenant isn't sharded. + return distance_to_range_end; + } + + if cursor.field6 == 0xffffffff { + // We are wrapping from one relation's logical size to the next relation's first data block + return 1; + } + + let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0; + let stripe_remainder = self.shard_identity.stripe_size.0 + - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0); + + if cfg!(debug_assertions) { + // We should never overflow field5 and field6 -- our callers check this earlier + // and would have returned their u32::MAX cases if the input range violated this. + let next_cursor = cursor.add(stripe_remainder); + debug_assert!( + next_cursor.field1 == cursor.field1 + && next_cursor.field2 == cursor.field2 + && next_cursor.field3 == cursor.field3 + && next_cursor.field4 == cursor.field4 + && next_cursor.field5 == cursor.field5 + ) + } + + std::cmp::min(stripe_remainder, distance_to_range_end) + } + + /// Whereas `page_count` estimates the number of pages physically in this range on this shard, + /// this function simply calculates the number of pages in the space, without accounting for those + /// pages that would not actually be stored on this node. + /// + /// Don't use this function in code that works with physical entities like layer files. + fn raw_size(range: &Range) -> u32 { + if is_contiguous_range(range) { + contiguous_range_len(range) + } else { + u32::MAX + } + } +} + impl KeySpace { /// Create a key space with a single range. pub fn single(key_range: Range) -> Self { @@ -25,39 +256,36 @@ impl KeySpace { /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// - pub fn partition(&self, target_size: u64) -> KeyPartitioning { + pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. 
- let target_nblocks = (target_size / BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as u32; let mut parts = Vec::new(); let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { - // If appending the next contiguous range in the keyspace to the current - // partition would cause it to be too large, start a new partition. - let this_size = key_range_size(range) as usize; - if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - parts.push(KeySpace { - ranges: current_part, - }); - current_part = Vec::new(); - current_part_size = 0; - } + // While doing partitioning, wrap the range in ShardedRange so that our size calculations + // will respect shard striping rather than assuming all keys within a range are present. + let range = ShardedRange::new(range.clone(), shard_identity); - // If the next range is larger than 'target_size', split it into - // 'target_size' chunks. - let mut remain_size = this_size; - let mut start = range.start; - while remain_size > target_nblocks { - let next = start.add(target_nblocks as u32); - parts.push(KeySpace { - ranges: vec![start..next], - }); - start = next; - remain_size -= target_nblocks + // Chunk up the range into parts that each contain up to target_size local blocks + for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, and our current partition + // covers at least one block that is physically present in this shard, + // then start a new partition + if current_part_size + frag_on_shard_size as usize > target_nblocks as usize + && current_part_size > 0 + { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + current_part.push(frag_range.start..frag_range.end); + current_part_size += frag_on_shard_size as usize; } - current_part.push(start..range.end); - current_part_size += remain_size; } // add last partition that wasn't full yet. @@ -71,7 +299,7 @@ impl KeySpace { } pub fn is_empty(&self) -> bool { - self.total_size() == 0 + self.total_raw_size() == 0 } /// Merge another keyspace into the current one. @@ -164,11 +392,11 @@ impl KeySpace { self.ranges.last().map(|range| range.end) } - #[allow(unused)] - pub fn total_size(&self) -> usize { + /// The size of the keyspace in pages, before accounting for sharding + pub fn total_raw_size(&self) -> usize { self.ranges .iter() - .map(|range| key_range_size(range) as usize) + .map(|range| ShardedRange::raw_size(range) as usize) .sum() } @@ -242,7 +470,7 @@ impl KeySpaceAccum { #[inline(always)] pub fn add_range(&mut self, range: Range) { - self.size += key_range_size(&range) as u64; + self.size += ShardedRange::raw_size(&range) as u64; match self.accum.as_mut() { Some(accum) => { @@ -274,7 +502,9 @@ impl KeySpaceAccum { std::mem::take(self).to_keyspace() } - pub fn size(&self) -> u64 { + // The total number of keys in this object, ignoring any sharding effects that might cause some of + // the keys to be omitted in storage on this shard. 
+ pub fn raw_size(&self) -> u64 { self.size } } @@ -330,36 +560,19 @@ impl KeySpaceRandomAccum { } } -#[inline(always)] -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; - - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 - } -} - pub fn singleton_range(key: Key) -> Range { key..key.next() } #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + + use crate::{ + models::ShardParameters, + shard::{ShardCount, ShardNumber}, + }; + use super::*; use std::fmt::Write; @@ -402,14 +615,17 @@ mod tests { accum.add_range(range.clone()); } - let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum(); - assert_eq!(accum.size(), expected_size); + let expected_size: u64 = ranges + .iter() + .map(|r| ShardedRange::raw_size(r) as u64) + .sum(); + assert_eq!(accum.raw_size(), expected_size); assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); assert_ks_eq(&accum.consume_keyspace(), vec![]); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); for range in &ranges { accum.add_range(range.clone()); @@ -706,4 +922,412 @@ mod tests { ] ); } + #[test] + fn sharded_range_relation_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067F00000005000040100300000000").unwrap(), + end: Key::from_hex("000000067F00000005000040130000004000").unwrap(), + }, + &shard_identity, + ); + + // Key range spans relations, expect MAX + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_single_key() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(), + end: Key::from_hex("000000067f00000001000000700100000000").unwrap(), + }, + &shard_identity, + ); + // Single-key range on logical size key + assert_eq!(range.page_count(), 1); + } + + /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation + #[test] + fn contiguous_range_check() { + assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000003").unwrap()) + ),); + + // The ranges goes all the way up to the 0xffffffff, including it: this is + // not considered a rel block range because 0xffffffff stores logical sizes, + // not blocks. 
+ assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000000").unwrap()) + ),); + + // Keys within the normal data region of a relation + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df0000000000").unwrap() + ..Key::from_hex("000000067f00000001000004df0000000080").unwrap()) + ),); + + // The logical size key of one forkno, then some blocks in the next + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000080").unwrap()) + ),); + } + + #[test] + fn shard_identity_keyspaces_forkno_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(), + end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(), + }, + &shard_identity, + ); + + // Range spanning the end of one forkno and the start of the next: we do not attempt to + // calculate a valid size, because we have no way to know if they keys between start + // and end are actually in use. + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_one_relation() { + for shard_number in 0..4 { + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(), + end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(), + }, + &shard_identity, + ); + + // Very simple case: range covering block zero of one relation, where that block maps to shard zero + if shard_number == 0 { + assert_eq!(range.page_count(), 1); + } else { + // Other shards should perceive the range's size as zero + assert_eq!(range.page_count(), 0); + } + } + } + + /// Test helper: construct a ShardedRange and call fragment() on it, returning + /// the total page count in the range and the fragments. 
+ fn do_fragment( + range_start: Key, + range_end: Key, + shard_identity: &ShardIdentity, + target_nblocks: u32, + ) -> (u32, Vec<(u32, Range)>) { + let range = ShardedRange::new( + Range { + start: range_start, + end: range_end, + }, + shard_identity, + ); + + let page_count = range.page_count(); + let fragments = range.fragment(target_nblocks); + + // Invariant: we always get at least one fragment + assert!(!fragments.is_empty()); + + // Invariant: the first/last fragment start/end should equal the input start/end + assert_eq!(fragments.first().unwrap().1.start, range_start); + assert_eq!(fragments.last().unwrap().1.end, range_end); + + if page_count > 0 { + // Invariant: every fragment must contain at least one shard-local page, if the + // total range contains at least one shard-local page + let all_nonzero = fragments.iter().all(|f| f.0 > 0); + if !all_nonzero { + eprintln!("Found a zero-length fragment: {:?}", fragments); + } + assert!(all_nonzero); + } else { + // A range with no shard-local pages should always be returned as a single fragment + assert_eq!(fragments, vec![(0, range_start..range_end)]); + } + + // Invariant: fragments must be ordered and non-overlapping + let mut last: Option> = None; + for frag in &fragments { + if let Some(last) = last { + assert!(frag.1.start >= last.end); + assert!(frag.1.start > last.start); + } + last = Some(frag.1.clone()) + } + + // Invariant: fragments respect target_nblocks + for frag in &fragments { + assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks); + } + + (page_count, fragments) + } + + /// Really simple tests for fragment(), on a range that just contains a single stripe + /// for a single tenant. + #[test] + fn sharded_range_fragment_simple() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which we happen to know covers exactly one stripe which belongs to this shard + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + + // Ask for stripe_size blocks, we get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 32768), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for more, we still get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 10000000), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for target_nblocks of half the stripe size, we get two halves + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16384), + ( + 32768, + vec![ + (16384, input_start..input_start.add(16384)), + (16384, input_start.add(16384)..input_end) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_multi_stripe() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which covers multiple stripes, exactly one of which belongs to the current shard. + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + // Ask for all the blocks, get a fragment that covers the whole range but reports + // its size to be just the blocks belonging to our shard. 
+ assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 131072), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for a sub-stripe quantity + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16000), + ( + 32768, + vec![ + (16000, input_start..input_start.add(16000)), + (16000, input_start.add(16000)..input_start.add(32000)), + (768, input_start.add(32000)..input_end), + ] + ) + ); + + // Try on a range that starts slightly after our owned stripe + assert_eq!( + do_fragment(input_start.add(1), input_end, &shard_identity, 131072), + (32767, vec![(32767, input_start.add(1)..input_end)]) + ); + } + + /// Test our calculations work correctly when we start a range from the logical size key of + /// a previous relation. + #[test] + fn sharded_range_fragment_starting_from_logical_size() { + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + + // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x8001, vec![(0x8001, input_start..input_end)]) + ); + + // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards + // store all logical sizes) + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x1, vec![(0x1, input_start..input_end)]) + ); + } + + /// Test that ShardedRange behaves properly when used on un-sharded data + #[test] + fn sharded_range_fragment_unsharded() { + let shard_identity = ShardIdentity::unsharded(); + + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + ( + 0x10000, + vec![ + (0x8000, input_start..input_start.add(0x8000)), + (0x8000, input_start.add(0x8000)..input_start.add(0x10000)) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_cross_relation() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + + // Same, but using a sharded identity + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + } + + #[test] + fn sharded_range_fragment_tiny_nblocks() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap(); + let input_end = 
Key::from_hex("000000067F00000001000004E10000000038").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16), + ( + 0x38, + vec![ + (16, input_start..input_start.add(16)), + (16, input_start.add(16)..input_start.add(32)), + (16, input_start.add(32)..input_start.add(48)), + (8, input_start.add(48)..input_end), + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_fuzz() { + // Use a fixed seed: we don't want to explicitly pick values, but we do want + // the test to be reproducible. + let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef); + + for _i in 0..1000 { + let shard_identity = if prng.next_u32() % 2 == 0 { + ShardIdentity::unsharded() + } else { + let shard_count = prng.next_u32() % 127 + 1; + ShardIdentity::new( + ShardNumber((prng.next_u32() % shard_count) as u8), + ShardCount::new(shard_count as u8), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap() + }; + + let target_nblocks = prng.next_u32() % 65536 + 1; + + let start_offset = prng.next_u32() % 16384; + + // Try ranges up to 4GiB in size, that are always at least 1 + let range_size = prng.next_u32() % 8192 + 1; + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000") + .unwrap() + .add(start_offset); + let input_end = input_start.add(range_size); + + // This test's main success conditions are the invariants baked into do_fragment + let (_total_size, fragments) = + do_fragment(input_start, input_end, &shard_identity, target_nblocks); + + // Pick a random key within the range and check it appears in the output + let example_key = input_start.add(prng.next_u32() % range_size); + + // Panic on unwrap if it isn't found + let example_key_frag = fragments + .iter() + .find(|f| f.1.contains(&example_key)) + .unwrap(); + + // Check that the fragment containing our random key has a nonzero size if + // that key is shard-local + let example_key_local = !shard_identity.is_key_disposable(&example_key); + if example_key_local { + assert!(example_key_frag.0 > 0); + } + } + } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 6a8a5cc8f3..2d7f6772b2 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -451,7 +451,7 @@ impl ShardIdentity { /// An identity with number=0 count=0 is a "none" identity, which represents legacy /// tenants. Modern single-shard tenants should not use this: they should /// have number=0 count=1. - pub fn unsharded() -> Self { + pub const fn unsharded() -> Self { Self { number: ShardNumber(0), count: ShardCount(0), diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 5261746b22..137b93055a 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -18,6 +18,7 @@ //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. 
use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; @@ -125,6 +126,7 @@ async fn compact_level( } let mut state = LevelCompactionState { + shard_identity: *executor.get_shard_identity(), target_file_size, _lsn_range: lsn_range.clone(), layers: layer_fragments, @@ -164,6 +166,8 @@ struct LevelCompactionState<'a, E> where E: CompactionJobExecutor, { + shard_identity: ShardIdentity, + // parameters target_file_size: u64, @@ -366,6 +370,7 @@ where .executor .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, + &self.shard_identity, ) * 8192; let wal_size = job @@ -430,7 +435,7 @@ where keyspace, self.target_file_size / 8192, ); - while let Some(key_range) = window.choose_next_image() { + while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { key_range, lsn_range: job.lsn_range.clone(), @@ -623,7 +628,12 @@ impl KeyspaceWindowPos { } // Advance the cursor until it reaches 'target_keysize'. - fn advance_until_size(&mut self, w: &KeyspaceWindowHead, max_size: u64) { + fn advance_until_size( + &mut self, + w: &KeyspaceWindowHead, + max_size: u64, + shard_identity: &ShardIdentity, + ) { while self.accum_keysize < max_size && !self.reached_end(w) { let curr_range = &w.keyspace[self.keyspace_idx]; if self.end_key < curr_range.start { @@ -632,7 +642,7 @@ impl KeyspaceWindowPos { } // We're now within 'curr_range'. Can we advance past it completely? - let distance = K::key_range_size(&(self.end_key..curr_range.end)); + let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity); if (self.accum_keysize + distance as u64) < max_size { // oh yeah, it fits self.end_key = curr_range.end; @@ -641,7 +651,7 @@ impl KeyspaceWindowPos { } else { // advance within the range let skip_key = self.end_key.skip_some(); - let distance = K::key_range_size(&(self.end_key..skip_key)); + let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity); if (self.accum_keysize + distance as u64) < max_size { self.end_key = skip_key; self.accum_keysize += distance as u64; @@ -677,7 +687,7 @@ where } } - fn choose_next_image(&mut self) -> Option> { + fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option> { if self.start_pos.keyspace_idx == self.head.keyspace.len() { // we've reached the end return None; @@ -687,6 +697,7 @@ where next_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + self.head.target_keysize, + shard_identity, ); // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to @@ -695,6 +706,7 @@ where end_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + shard_identity, ); if end_pos.reached_end(&self.head) { // gobble up any unused keyspace between the last used key and end of the range diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 9de6363d6e..1b80373ba7 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -5,6 +5,7 @@ use crate::interface::*; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; +use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; @@ -13,11 +14,17 @@ use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{ready, Poll}; -pub fn keyspace_total_size(keyspace: 
&CompactionKeySpace) -> u64 +pub fn keyspace_total_size( + keyspace: &CompactionKeySpace, + shard_identity: &ShardIdentity, +) -> u64 where K: CompactionKey, { - keyspace.iter().map(|r| K::key_range_size(r) as u64).sum() + keyspace + .iter() + .map(|r| K::key_range_size(r, shard_identity) as u64) + .sum() } pub fn overlaps_with(a: &Range, b: &Range) -> bool { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 5dc62e506f..35519b5d0a 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -4,7 +4,7 @@ //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. use futures::Future; -use pageserver_api::{key::Key, keyspace::key_range_size}; +use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; use std::ops::Range; use utils::lsn::Lsn; @@ -32,6 +32,8 @@ pub trait CompactionJobExecutor { // Functions that the planner uses to support its decisions // ---- + fn get_shard_identity(&self) -> &ShardIdentity; + /// Return all layers that overlap the given bounding box. fn get_layers( &mut self, @@ -98,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { /// /// This returns u32, for compatibility with Repository::key. If the /// distance is larger, return u32::MAX. - fn key_range_size(key_range: &Range) -> u32; + fn key_range_size(key_range: &Range, shard_identity: &ShardIdentity) -> u32; // return "self + 1" fn next(&self) -> Self; @@ -113,8 +115,8 @@ impl CompactionKey for Key { const MIN: Self = Self::MIN; const MAX: Self = Self::MAX; - fn key_range_size(r: &std::ops::Range) -> u32 { - key_range_size(r) + fn key_range_size(r: &std::ops::Range, shard_identity: &ShardIdentity) -> u32 { + ShardedRange::new(r.clone(), shard_identity).page_count() } fn next(&self) -> Key { (self as &Key).next() diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 6c00df3a65..3543df64fa 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -3,6 +3,7 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; @@ -71,7 +72,7 @@ impl interface::CompactionKey for Key { const MIN: Self = u64::MIN; const MAX: Self = u64::MAX; - fn key_range_size(key_range: &Range) -> u32 { + fn key_range_size(key_range: &Range, _shard_identity: &ShardIdentity) -> u32 { std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 } @@ -434,6 +435,11 @@ impl interface::CompactionJobExecutor for MockTimeline { type ImageLayer = Arc; type RequestContext = MockRequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + static IDENTITY: ShardIdentity = ShardIdentity::unsharded(); + &IDENTITY + } + async fn get_layers( &mut self, key_range: &Range, diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ba047745f1..8c51e93643 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -263,7 +263,10 @@ where .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) .await? 
- .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); + .partition( + self.timeline.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5fb5d231c7..1a85481e97 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -401,7 +401,7 @@ impl InMemoryLayer { } } - let keyspace_size = keyspace.total_size(); + let keyspace_size = keyspace.total_raw_size(); let mut completed_keys = HashSet::new(); while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 108acd3925..c5068386d6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -936,7 +936,7 @@ impl Timeline { return Err(GetVectoredError::InvalidLsn(lsn)); } - let key_count = keyspace.total_size().try_into().unwrap(); + let key_count = keyspace.total_raw_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } @@ -1076,7 +1076,7 @@ impl Timeline { mut reconstruct_state: ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let get_kind = if keyspace.total_size() == 1 { + let get_kind = if keyspace.total_raw_size() == 1 { GetKind::Singular } else { GetKind::Vectored @@ -3207,7 +3207,7 @@ impl Timeline { } } - if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { + if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { break; } @@ -3220,7 +3220,7 @@ impl Timeline { timeline = &*timeline_owned; } - if keyspace.total_size() != 0 { + if keyspace.total_raw_size() != 0 { return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); } @@ -3911,7 +3911,7 @@ impl Timeline { } let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(partition_size); + let partitioning = keyspace.partition(&self.shard_identity, partition_size); *partitioning_guard = (partitioning, lsn); @@ -4064,7 +4064,7 @@ impl Timeline { key = key.next(); // Maybe flush `key_rest_accum` - if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS || last_key_in_range { let results = self diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8075775bbc..b92832a3de 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -15,7 +15,7 @@ use anyhow::{anyhow, Context}; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::shard::TenantShardId; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -831,6 +831,10 @@ impl CompactionJobExecutor for TimelineAdaptor { type RequestContext = crate::context::RequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + self.timeline.get_shard_identity() + } + async fn get_layers( &mut self, key_range: &Range, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 3902819d3d..43a3323462 100644 --- 
a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,4 +1,6 @@ +import json import os +from typing import Optional import pytest from fixtures.log_helper import log @@ -89,3 +91,102 @@ page_cache_size=10 # was chosen empirically for this workload. assert non_vectored_average < 8 assert vectored_average < 8 + + +# Stripe sizes in number of pages. +TINY_STRIPES = 16 +LARGE_STRIPES = 32768 + + +@pytest.mark.parametrize( + "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)] +) +def test_sharding_compaction( + neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int] +): + """ + Use small stripes, small layers, and small compaction thresholds to exercise how compaction + and image layer generation interact with sharding. + + We are looking for bugs that might emerge from the way sharding uses sparse layer files that + only contain some of the keys in the key range covered by the layer, such as errors estimating + the size of layers that might result in too-small layer files. + """ + + compaction_target_size = 128 * 1024 + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{compaction_target_size}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly: we want to exercise image layer creation in this test. + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": 0, + } + + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 10): + # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1, + # these should result in image layers each time we write some data into a shard, and also shards + # receiving less data hitting their "empty image layer" path (where they should skip writing the layer, + # rather than asserting) + workload.churn_rows(64) + + # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes + # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job.
+ shard_has_image_layers = [] + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + image_layer_sizes = {} + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_sizes[layer.layer_file_name] = layer.layer_file_size + + # Pageserver should assert rather than emit an empty layer file, but double check here + assert layer.layer_file_size is not None + assert layer.layer_file_size > 0 + + shard_has_image_layers.append(len(image_layer_sizes) > 1) + log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}") + + if stripe_size == TINY_STRIPES: + # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if + # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical + # data in a keyrange. + # + # We only do this check with tiny stripes, because large stripes may not give all shards enough + # data to have statistically significant image layers + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore + log.info(f"Shard {shard_id} average image layer size: {avg_size}") + assert avg_size > compaction_target_size / 2 + + if stripe_size == TINY_STRIPES: + # Expect writes were scattered across all pageservers: they should all have compacted some image layers + assert all(shard_has_image_layers) + else: + # With large stripes, it is expected that most of our writes went to one pageserver, so we just require + # that at least one of them has some image layers. + assert any(shard_has_image_layers) + + # Assert that everything is still readable + workload.validate() From 577982b7782aceaa0782ef4295663d72d39b09aa Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 30 Apr 2024 11:04:54 +0100 Subject: [PATCH 109/157] pageserver: remove workarounds from #7454 (#7550) PR #7454 included a workaround that let any existing bugged databases start up. Having used that already, we may now Closes: https://github.com/neondatabase/neon/issues/7480 --- libs/pageserver_api/src/shard.rs | 18 ------------------ pageserver/src/basebackup.rs | 17 ++--------------- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 2d7f6772b2..d769b2fd2f 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -538,24 +538,6 @@ impl ShardIdentity { } } - /// Special case for issue `` - /// - /// When we fail to read a forknum block, this function tells us whether we may ignore the error - /// as a symptom of that issue. - pub fn is_key_buggy_forknum(&self, key: &Key) -> bool { - if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM { - return false; - } - - let mut hash = murmurhash32(key.field4); - hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0)); - let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8); - - // The key may be affected by issue #7454: it is an initfork and it would not - // have mapped to shard 0 until we fixed that issue. - mapped_shard != ShardNumber(0) - } - /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. 
/// diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 8c51e93643..53abd8bfb9 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key}; +use pageserver_api::key::{key_to_slru_block, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -300,20 +300,7 @@ where if rel.forknum == INIT_FORKNUM { // I doubt we need _init fork itself, but having it at least // serves as a marker relation is unlogged. - if let Err(_e) = self.add_rel(rel, rel).await { - if self - .timeline - .get_shard_identity() - .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0)) - { - // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation - // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows - // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and - // recreate. - tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation"); - continue; - } - }; + self.add_rel(rel, rel).await?; self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; continue; } From 84b6b95783eaecea06b40e2e87ddcdd70aa9e504 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Tue, 30 Apr 2024 14:17:01 +0100 Subject: [PATCH 110/157] docs: fix unintentional file link (#7506) Not sure if this should actually be a link pointing to the `persistence.rs` file but following the conventions of the rest of the file, change `persistence.rs` reference to simply be a file name mention. --- docs/storage_controller.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/storage_controller.md b/docs/storage_controller.md index 4cb796edaa..daf4d0c8b7 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and rebuilt on startup. -The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. +The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. The `diesel` crate is used for defining models & migrations. From 45c625fb349c3dbe711e5868bfa389da298bc960 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 30 Apr 2024 09:39:10 -0400 Subject: [PATCH 111/157] feat(pageserver): separate sparse and dense keyspace (#7503) extracted (and tested) from https://github.com/neondatabase/neon/pull/7468, part of https://github.com/neondatabase/neon/issues/7462. The current codebase assumes the keyspace is dense -- which means that if we have a keyspace of 0x00-0x100, we assume every key (e.g., 0x00, 0x01, 0x02, ...) exists in the storage engine. However, the assumption does not hold any more in metadata keyspace. The metadata keyspace is sparse. It is impossible to do per-key check. Ideally, we should not have the assumption of dense keyspace at all, but this would incur a lot of refactors. 
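As a concrete illustration of the difference (illustrative only: keys are plain integers here rather than the real pageserver `Key`/`KeySpace` types, and both helpers are hypothetical):

```rust
use std::collections::BTreeSet;
use std::ops::Range;

// Dense assumption: every key in the range exists, so the key count is just the
// width of the range and it is meaningful to visit the keys one by one.
fn dense_key_count(range: &Range<u64>) -> u64 {
    range.end - range.start
}

// Sparse keyspace: only the keys that are actually present can be enumerated;
// probing every possible key in a wide range would effectively never terminate.
fn sparse_key_count(present: &BTreeSet<u64>, range: Range<u64>) -> usize {
    present.range(range).count()
}
```

With a sparse keyspace only the presence-based form is viable, so per-key loops over a whole range have to be avoided.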
Therefore, we split the keyspaces we have to dense/sparse and handle them differently in the code for now. At some point in the future, we should assume all keyspaces are sparse. ## Summary of changes * Split collect_keyspace to return dense+sparse keyspace. * Do not allow generating image layers for sparse keyspace (for now -- will fix this next week, we need image layers anyways). * Generate delta layers for sparse keyspace. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/keyspace.rs | 27 ++ .../pageserver_api/src/models/partitioning.rs | 14 +- pageserver/src/http/routes.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 12 +- pageserver/src/tenant/layer_map.rs | 1 + .../tenant/storage_layer/inmemory_layer.rs | 24 +- pageserver/src/tenant/timeline.rs | 251 ++++++++++++------ pageserver/src/tenant/timeline/compaction.rs | 37 ++- 8 files changed, 269 insertions(+), 103 deletions(-) diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 4283da18ab..a9ad3aca18 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -17,6 +17,10 @@ pub struct KeySpace { pub ranges: Vec>, } +/// A wrapper type for sparse keyspaces. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SparseKeySpace(pub KeySpace); + /// Represents a contiguous half-open range of the keyspace, masked according to a particular /// ShardNumber's stripes: within this range of keys, only some "belong" to the current /// shard. @@ -435,10 +439,33 @@ pub struct KeyPartitioning { pub parts: Vec, } +/// Represents a partitioning of the sparse key space. +#[derive(Clone, Debug, Default)] +pub struct SparseKeyPartitioning { + pub parts: Vec, +} + impl KeyPartitioning { pub fn new() -> Self { KeyPartitioning { parts: Vec::new() } } + + /// Convert a key partitioning to a sparse partition. + pub fn into_sparse(self) -> SparseKeyPartitioning { + SparseKeyPartitioning { + parts: self.parts.into_iter().map(SparseKeySpace).collect(), + } + } +} + +impl SparseKeyPartitioning { + /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will + /// cause long/dead loops. 
+ pub fn into_dense(self) -> KeyPartitioning { + KeyPartitioning { + parts: self.parts.into_iter().map(|x| x.0).collect(), + } + } } /// diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index 0d287f7be0..f6644be635 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -1,9 +1,11 @@ use utils::lsn::Lsn; +use crate::keyspace::SparseKeySpace; + #[derive(Debug, PartialEq, Eq)] pub struct Partitioning { pub keys: crate::keyspace::KeySpace, - + pub sparse_keys: crate::keyspace::SparseKeySpace, pub at_lsn: Lsn, } @@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning { let mut map = serializer.serialize_map(Some(2))?; map.serialize_key("keys")?; map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("sparse_keys")?; + map.serialize_value(&KeySpace(&self.sparse_keys.0))?; map.serialize_key("at_lsn")?; map.serialize_value(&WithDisplay(&self.at_lsn))?; map.end() @@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { #[derive(serde::Deserialize)] struct De { keys: KeySpace, + sparse_keys: KeySpace, #[serde_as(as = "serde_with::DisplayFromStr")] at_lsn: Lsn, } @@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { Ok(Self { at_lsn: de.at_lsn, keys: de.keys.0, + sparse_keys: SparseKeySpace(de.sparse_keys.0), }) } } @@ -133,6 +139,12 @@ mod tests { "030000000000000000000000000000000003" ] ], + "sparse_keys": [ + [ + "620000000000000000000000000000000000", + "620000000000000000000000000000000003" + ] + ], "at_lsn": "0/2240160" } "#; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9a280c2e0c..ae1e7aac78 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1918,12 +1918,14 @@ async fn timeline_collect_keyspace( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let keys = timeline + let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace. + // Therefore, we split dense/sparse keys in this API. + let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn }; json_response(StatusCode::OK, res) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c76c2d5451..015191b875 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -23,6 +23,7 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -730,11 +731,13 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). 
+ /// + /// The return value is (dense keyspace, sparse keyspace). pub(crate) async fn collect_keyspace( &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result { + ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -806,7 +809,12 @@ impl Timeline { if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { result.add_key(AUX_FILES_KEY); } - Ok(result.to_keyspace()) + + Ok(( + result.to_keyspace(), + /* AUX sparse key space */ + SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())), + )) } /// Get cached size of relation if it not updated after specified LSN diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 4c4cd90c99..3c4de8fe4d 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -916,6 +916,7 @@ mod tests { assert_eq!(lhs, rhs); } + #[cfg(test)] fn brute_force_range_search( layer_map: &LayerMap, key_range: Range, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 1a85481e97..a2ae8ec29d 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -597,14 +597,17 @@ impl InMemoryLayer { } } - /// Write this frozen in-memory layer to disk. + /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta + /// layer will only contain the key range the user specifies, and may return `None` + /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer pub(crate) async fn write_to_disk( &self, timeline: &Arc, ctx: &RequestContext, - ) -> Result { + key_range: Option>, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. 
There's one exception @@ -618,6 +621,21 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); + let keys: Vec<_> = if let Some(key_range) = key_range { + inner + .index + .iter() + .filter(|(k, _)| key_range.contains(k)) + .map(|(k, m)| (k.to_i128(), m)) + .collect() + } else { + inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect() + }; + + if keys.is_empty() { + return Ok(None); + } + let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, @@ -649,6 +667,6 @@ impl InMemoryLayer { // MAX is used here because we identify L0 layers by full key range let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; - Ok(delta_layer) + Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c5068386d6..2a2c5d4ee5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -17,7 +17,7 @@ use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ key::{AUX_FILES_KEY, NON_INHERITED_RANGE}, - keyspace::KeySpaceAccum, + keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, @@ -55,7 +55,6 @@ use std::{ ops::ControlFlow, }; -use crate::deletion_queue::DeletionQueueClient; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -66,6 +65,7 @@ use crate::{ disk_usage_eviction_task::DiskUsageEvictionInfo, pgdatadir_mapping::CollectKeySpaceError, }; +use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind}; use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ @@ -86,7 +86,7 @@ use crate::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{ - GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, + TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; @@ -137,6 +137,25 @@ pub(super) enum FlushLoopState { Exited, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ImageLayerCreationMode { + /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path. + Try, + /// Force creating the image layers if possible. For now, no image layers will be created + /// for metadata keys. Used in compaction code path with force flag enabled. + Force, + /// Initial ingestion of the data, and no data should be dropped in this function. This + /// means that no metadata keys should be included in the partitions. Used in flush frozen layer + /// code path. + Initial, +} + +impl std::fmt::Display for ImageLayerCreationMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct Hole { @@ -317,7 +336,7 @@ pub struct Timeline { pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>, + partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. 
repartition_threshold: u64, @@ -2104,7 +2123,10 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new(( + (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), + Lsn(0), + )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), @@ -3106,7 +3128,6 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { let layer = guard.get_from_desc(&layer); drop(guard); - // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, lsn_floor); @@ -3227,7 +3248,7 @@ impl Timeline { Ok(()) } - /// Collect the reconstruct data for a ketspace from the specified timeline. + /// Collect the reconstruct data for a keyspace from the specified timeline. /// /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect /// the current keyspace. The current keyspace of the search at any given timeline @@ -3656,66 +3677,103 @@ impl Timeline { // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. let lsn_range = frozen_layer.get_lsn_range(); - let (layers_to_upload, delta_layer_to_add) = - if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - initdb_optimization_count, - .. - } => { + + // Whether to directly create image layers for this flush, or flush them as delta layers + let create_image_layer = + lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1); + + #[cfg(test)] + { + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + .. + } => { + if create_image_layer { *initdb_optimization_count += 1; - } - } - // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not - // require downloading anything during initial import. - let (partitioning, _lsn) = self - .repartition( - self.initdb_lsn, - self.get_compaction_target_size(), - EnumSet::empty(), - ctx, - ) - .await?; - - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - // For image layers, we add them immediately into the layer map. - ( - self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) - .await?, - None, - ) - } else { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - expect_initdb_optimization, - .. - } => { + } else { assert!(!*expect_initdb_optimization, "expected initdb optimization"); } } - // Normal case, write out a L0 delta layer file. - // `create_delta_layer` will not modify the layer map. - // We will remove frozen layer and add delta layer in one atomic operation later. 
- let layer = self.create_delta_layer(&frozen_layer, ctx).await?; - ( - // FIXME: even though we have a single image and single delta layer assumption - // we push them to vec - vec![layer.clone()], - Some(layer), + } + } + + let (layers_to_upload, delta_layer_to_add) = if create_image_layer { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. + let ((rel_partition, metadata_partition), _lsn) = self + .repartition( + self.initdb_lsn, + self.get_compaction_target_size(), + EnumSet::empty(), + ctx, ) + .await?; + + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + + // For metadata, always create delta layers. + let delta_layer = if !metadata_partition.parts.is_empty() { + assert_eq!( + metadata_partition.parts.len(), + 1, + "currently sparse keyspace should only contain a single aux file keyspace" + ); + let metadata_keyspace = &metadata_partition.parts[0]; + assert_eq!( + metadata_keyspace.0.ranges.len(), + 1, + "aux file keyspace should be a single range" + ); + self.create_delta_layer( + &frozen_layer, + ctx, + Some(metadata_keyspace.0.ranges[0].clone()), + ) + .await? + } else { + None }; + // For image layers, we add them immediately into the layer map. + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &rel_partition, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + + if let Some(delta_layer) = delta_layer { + layers_to_upload.push(delta_layer.clone()); + (layers_to_upload, Some(delta_layer)) + } else { + (layers_to_upload, None) + } + } else { + // Normal case, write out a L0 delta layer file. + // `create_delta_layer` will not modify the layer map. + // We will remove frozen layer and add delta layer in one atomic operation later. + let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else { + panic!("delta layer cannot be empty if no filter is applied"); + }; + ( + // FIXME: even though we have a single image and single delta layer assumption + // we push them to vec + vec![layer.clone()], + Some(layer), + ) + }; + pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable"); if self.cancel.is_cancelled() { @@ -3835,12 +3893,18 @@ impl Timeline { self: &Arc, frozen_layer: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + key_range: Option>, + ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; + let Some(new_delta) = frozen_layer + .write_to_disk(&self_clone, &ctx, key_range) + .await? + else { + return Ok(None); + }; // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. @@ -3859,7 +3923,7 @@ impl Timeline { .sync_all() .await .fatal_err("VirtualFile::sync_all timeline dir"); - anyhow::Ok(new_delta) + anyhow::Ok(Some(new_delta)) }; // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. 
@@ -3886,19 +3950,20 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> anyhow::Result<(KeyPartitioning, Lsn)> { + ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> { let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. anyhow::bail!("repartition() called concurrently, this should not happen"); }; - if lsn < partitioning_guard.1 { + let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + if lsn < *partition_lsn { anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } - let distance = lsn.0 - partitioning_guard.1 .0; - if partitioning_guard.1 != Lsn(0) + let distance = lsn.0 - partition_lsn.0; + if *partition_lsn != Lsn(0) && distance <= self.repartition_threshold && !flags.contains(CompactFlags::ForceRepartition) { @@ -3907,13 +3972,18 @@ impl Timeline { threshold = self.repartition_threshold, "no repartitioning needed" ); - return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + return Ok(( + (dense_partition.clone(), sparse_partition.clone()), + *partition_lsn, + )); } - let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(&self.shard_identity, partition_size); - - *partitioning_guard = (partitioning, lsn); + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); + let sparse_partitioning = SparseKeyPartitioning { + parts: vec![sparse_ks], + }; // no partitioning for metadata keys for now + *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -3969,12 +4039,12 @@ impl Timeline { false } - #[tracing::instrument(skip_all, fields(%lsn, %force))] + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, partitioning: &KeyPartitioning, lsn: Lsn, - force: bool, + mode: ImageLayerCreationMode, ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -4011,19 +4081,26 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - let do_it = if force { - true - } else if check_for_image_layers { - // [`Self::time_for_new_image_layer`] is CPU expensive, - // so skip if we've not collected enough WAL since the last time - self.time_for_new_image_layer(partition, lsn).await - } else { - false - }; - - if !do_it { - start = img_range.end; - continue; + if partition.overlaps(&Key::metadata_key_range()) { + // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a + // rather big change. Keep this patch small for now. + match mode { + ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => { + // skip image layer creation anyways for metadata keys. 
+ start = img_range.end; + continue; + } + ImageLayerCreationMode::Initial => { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); + } + } + } else if let ImageLayerCreationMode::Try = mode { + // check_for_image_layers = false -> skip + // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate + if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } } let mut image_layer_writer = ImageLayerWriter::new( diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index b92832a3de..6ea37bf793 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,7 +9,7 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use super::layer_manager::LayerManager; -use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; +use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; use enumset::EnumSet; @@ -102,7 +102,7 @@ impl Timeline { ) .await { - Ok((partitioning, lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::extend(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -115,17 +115,37 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". - let layers = self + let dense_layers = self .create_image_layers( - &partitioning, + &dense_partitioning, lsn, - flags.contains(CompactFlags::ForceImageLayerCreation), + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, &image_ctx, ) .await .map_err(anyhow::Error::from)?; - self.upload_new_image_layers(layers)?; + // For now, nothing will be produced... + let sparse_layers = self + .create_image_layers( + &sparse_partitioning.clone().into_dense(), + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + assert!(sparse_layers.is_empty()); + + self.upload_new_image_layers(dense_layers)?; } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -758,8 +778,9 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let keyspace = self.collect_keyspace(end_lsn, ctx).await?; - let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); + let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + // TODO(chi): ignore sparse_keyspace for now, compact it in the future. + let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); pageserver_compaction::compact_tiered::compact_tiered( &mut adaptor, From eb53345d48b14d2ad474a8983a09c42d82ca2e5d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 30 Apr 2024 15:16:15 +0100 Subject: [PATCH 112/157] pageserver: reduce runtime of init_tenant_mgr (#7553) ## Problem `init_tenant_mgr` blocks the rest of pageserver startup, including starting the admin API. This was noticeable in #7475 , where the init_tenant_mgr runtime could be long enough to trip the controller's 30 second heartbeat timeout. 
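A rough sketch of the bounded-concurrency pattern used for the config writes (see the summary below); the write function here is a placeholder standing in for `Tenant::persist_tenant_config`, not the real pageserver code:

```rust
use futures::stream::{self, StreamExt};

// Placeholder config write: only the concurrency pattern matters for this sketch.
async fn write_one_config(path: String) -> std::io::Result<()> {
    tokio::fs::write(&path, b"location config placeholder").await
}

async fn write_all_configs(paths: Vec<String>) -> Vec<std::io::Result<()>> {
    // Issue up to 16 writes at a time instead of awaiting them sequentially,
    // so local filesystem latency is paid concurrently rather than once per shard.
    stream::iter(paths.into_iter().map(write_one_config))
        .buffer_unordered(16)
        .collect::<Vec<_>>()
        .await
}
```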
## Summary of changes - When detaching tenants during startup, spawn the background deletes as background tasks instead of doing them inline - Write all configs before spawning any tenants, so that the config writes aren't fighting tenants for system resources - Write configs with some concurrency (16) rather than writing them all sequentially. --- pageserver/src/tenant/mgr.rs | 105 +++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 37 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2c9476ba0a..006d501daa 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,6 +2,7 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; @@ -253,17 +254,15 @@ impl TenantsMap { } } +/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then +/// the slower actual deletion in the background. +/// /// This is "safe" in that that it won't leave behind a partially deleted directory /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting /// the contents. /// /// This is pageserver-specific, as it relies on future processes after a crash to check /// for TEMP_FILE_SUFFIX when loading things. -async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { - let tmp_path = safe_rename_tenant_dir(path).await?; - fs::remove_dir_all(tmp_path).await -} - async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { let parent = path .as_ref() @@ -286,6 +285,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); @@ -570,7 +591,11 @@ pub async fn init_tenant_mgr( ); TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); - // Construct `Tenant` objects and start them running + // Accumulate futures for writing tenant configs, so that we can execute in parallel + let mut config_write_futs = Vec::new(); + + // Update the location configs according to the re-attach response and persist them to disk + tracing::info!("Updating {} location configs", tenant_configs.len()); for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -597,18 +622,22 @@ pub async fn init_tenant_mgr( const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = SecondaryLocationConfig { warm: true }; - // Update the location config according to the re-attach response if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. 
match tenant_modes.get(&tenant_shard_id) { None => { info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } + + match safe_rename_tenant_dir(&tenant_dir_path).await { + Ok(tmp_path) => { + spawn_background_purge(tmp_path); + } + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}"); + } + }; // We deleted local content: move on to next tenant, don't try and spawn this one. continue; @@ -654,8 +683,32 @@ pub async fn init_tenant_mgr( // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + config_write_futs.push(async move { + let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; + (tenant_shard_id, location_conf, r) + }); + } + // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency + tracing::info!( + "Writing {} location config files...", + config_write_futs.len() + ); + let config_write_results = futures::stream::iter(config_write_futs) + .buffer_unordered(16) + .collect::>() + .await; + + tracing::info!( + "Spawning {} tenant shard locations...", + config_write_results.len() + ); + // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running + for (tenant_shard_id, location_conf, config_write_result) in config_write_results { + // Errors writing configs are fatal + config_write_result?; + + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { LocationMode::Attached(attached_conf) => { @@ -1699,7 +1752,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1854,28 +1907,6 @@ impl TenantManager { shutdown_all_tenants0(self.tenants).await } - /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in - /// the background, and thereby avoid blocking any API requests on this deletion completing. - fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) { - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. 
- let task_tenant_id = None; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - } - pub(crate) async fn detach_tenant( &self, conf: &'static PageServerConf, @@ -1892,7 +1923,7 @@ impl TenantManager { deletion_queue_client, ) .await?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); Ok(()) } From 010f0a310a83b5ab7101165ade9f3284a69a4bfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 30 Apr 2024 16:52:54 +0200 Subject: [PATCH 113/157] Make test_random_updates and test_read_at_max_lsn compatible with new compaction (#7551) Makes two of the tests work with the tiered compaction that I had to ignore in #7283. The issue was that tiered compaction actually created image layers, but the keys didn't appear in them as `collect_keyspace` didn't include them. Not a compaction problem, but due to how the test is structured. Fixes #7287 --- pageserver/src/tenant.rs | 66 +++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cb3e36efb3..05ceff2b59 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3873,6 +3873,7 @@ mod tests { use hex_literal::hex; use pageserver_api::key::NON_INHERITED_RANGE; use pageserver_api::keyspace::KeySpace; + use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -4512,11 +4513,23 @@ mod tests { } async fn bulk_insert_compact_gc( + timeline: Arc, + ctx: &RequestContext, + lsn: Lsn, + repeat: usize, + key_count: usize, + ) -> anyhow::Result<()> { + let compact = true; + bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await + } + + async fn bulk_insert_maybe_compact_gc( timeline: Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, key_count: usize, + compact: bool, ) -> anyhow::Result<()> { let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4557,9 +4570,11 @@ mod tests { ) .await?; timeline.freeze_and_flush().await?; - timeline - .compact(&CancellationToken::new(), EnumSet::empty(), ctx) - .await?; + if compact { + timeline + .compact(&CancellationToken::new(), EnumSet::empty(), ctx) + .await?; + } timeline.gc().await?; } @@ -5042,7 +5057,22 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_random_updates")?; + let names_algorithms = [ + ("test_random_updates_legacy", CompactionAlgorithm::Legacy), + ("test_random_updates_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_random_updates_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_random_updates_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) @@ -5107,7 +5137,7 @@ mod tests { ); } - // Perform a cycle of flush, compact, and GC + // Perform a 
cycle of flush, and GC let cutoff = tline.get_last_record_lsn(); tline .update_gc_info( @@ -5119,9 +5149,6 @@ mod tests { ) .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) - .await?; tline.gc().await?; } @@ -5402,19 +5429,36 @@ mod tests { #[tokio::test] async fn test_read_at_max_lsn() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_read_at_max_lsn")?; + let names_algorithms = [ + ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy), + ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_read_at_max_lsn_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_read_at_max_lsn_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + let compact = false; + bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); - assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok()); + let result = tline.get(test_key, read_lsn, &ctx).await; + assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err()); Ok(()) } From 3da54e6d90c7befdff50df48206fa441c24b6e94 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 3 Apr 2024 15:46:54 +0300 Subject: [PATCH 114/157] s3_scrubber: implement scan-metadata for safekeepers. It works by listing postgres table with memory dump of safekeepers state. s3 contents for each timeline are checked then against timeline_start_lsn and backup_lsn. If inconsistency is found, before complaining timeline (branch) is checked at control plane; it might have been deleted between the dump take and s3 check. 
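The core of the per-timeline check is to compute which WAL segment files must exist between `timeline_start_lsn` and `backup_lsn`, then compare that set against the S3 listing. A condensed sketch of that step (the helper name is invented; it mirrors the logic in the new `scan_safekeeper_metadata.rs` below, which assumes the default 16MB segment size):

```rust
use std::collections::HashSet;

use postgres_ffi::{XLogFileName, PG_TLI};
use utils::lsn::Lsn;

const WAL_SEGSIZE: usize = 16 * 1024 * 1024;

// Every segment from the timeline start up to (but not including) the segment
// containing backup_lsn must be present in the bucket; anything missing is
// reported, unless the control plane says the branch was deleted in the meantime.
fn expected_segment_names(timeline_start_lsn: Lsn, backup_lsn: Lsn) -> HashSet<String> {
    let first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
    let last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
    (first_segno..last_segno)
        .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE))
        .collect()
}
```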
--- Cargo.lock | 21 ++ Cargo.toml | 2 +- s3_scrubber/Cargo.toml | 4 + s3_scrubber/README.md | 10 +- s3_scrubber/src/lib.rs | 16 +- s3_scrubber/src/main.rs | 109 ++++++-- ...etadata.rs => scan_pageserver_metadata.rs} | 0 s3_scrubber/src/scan_safekeeper_metadata.rs | 234 ++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 4 +- 9 files changed, 363 insertions(+), 37 deletions(-) rename s3_scrubber/src/{scan_metadata.rs => scan_pageserver_metadata.rs} (100%) create mode 100644 s3_scrubber/src/scan_safekeeper_metadata.rs diff --git a/Cargo.lock b/Cargo.lock index de548bb2de..f2f06210cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3184,6 +3184,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -3520,6 +3530,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -5095,8 +5111,11 @@ dependencies = [ "hex", "histogram", "itertools", + "native-tls", "pageserver", "pageserver_api", + "postgres-native-tls", + "postgres_ffi", "rand 0.8.5", "remote_storage", "reqwest", @@ -5105,6 +5124,7 @@ dependencies = [ "serde_with", "thiserror", "tokio", + "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", "tracing", @@ -6507,6 +6527,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", diff --git a/Cargo.toml b/Cargo.toml index 92dcc254d4..32a0bc23e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.20.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" urlencoding = "2.1" diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 0ee9112010..37124e6caf 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,7 +22,11 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true +native-tls.workspace = true +postgres-native-tls.workspace = true +postgres_ffi.workspace = true tokio-stream.workspace = true +tokio-postgres.workspace = true futures-util.workspace = true itertools.workspace = true camino.workspace = true diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index 2f21b9f191..c1deab8852 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted. #### `scan-metadata` -Walk objects in a pageserver S3 bucket, and report statistics on the contents. 
+Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency. +Errors are logged to stderr and summary to stdout. +For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata +env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 @@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053 ``` +For safekeepers, dump_db_connstr and dump_db_table must be +specified; they should point to table with debug dump which will be used +to list timelines and find their backup and start LSNs. + ## Cleaning up running pageservers If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers. diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 90d58a3bc2..43be258150 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -4,7 +4,8 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; -pub mod scan_metadata; +pub mod scan_pageserver_metadata; +pub mod scan_safekeeper_metadata; pub mod tenant_snapshot; use std::env; @@ -141,12 +142,17 @@ impl RootTarget { pub fn tenants_root(&self) -> S3Target { match self { Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), - Self::Safekeeper(root) => root.with_sub_segment("wal"), + Self::Safekeeper(root) => root.clone(), } } pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { - self.tenants_root().with_sub_segment(&tenant_id.to_string()) + match self { + Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()), + Self::Safekeeper(_) => self + .tenants_root() + .with_sub_segment(&tenant_id.tenant_id.to_string()), + } } pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { @@ -337,9 +343,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("safekeeper/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal".to_string()), delimiter, }), }; diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 88ba9bfa61..e49c280b99 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,9 +1,13 @@ +use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::scan_metadata::scan_metadata; +use s3_scrubber::scan_pageserver_metadata::scan_metadata; use s3_scrubber::tenant_snapshot::SnapshotDownloader; -use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; +use s3_scrubber::{ + init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, + NodeKind, TraversingDepth, +}; use clap::{Parser, Subcommand}; use 
utils::id::TenantId; @@ -35,11 +39,20 @@ enum Command { #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, }, + #[command(verbatim_doc_comment)] ScanMetadata { + #[arg(short, long)] + node_kind: NodeKind, #[arg(short, long, default_value_t = false)] json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, + #[arg(long, default_value = None)] + /// For safekeeper node_kind only, points to db with debug dump + dump_db_connstr: Option, + /// For safekeeper node_kind only, table in the db with debug dump + #[arg(long, default_value = None)] + dump_db_table: Option, }, TenantSnapshot { #[arg(long = "tenant-id")] @@ -72,33 +85,75 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json, tenant_ids } => { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) + Command::ScanMetadata { + json, + tenant_ids, + node_kind, + dump_db_connstr, + dump_db_table, + } => { + if let NodeKind::Safekeeper = node_kind { + let dump_db_connstr = + dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; + let dump_db_table = + dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; + + let summary = scan_safekeeper_metadata( + bucket_config.clone(), + tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), + dump_db_connstr, + dump_db_table, + ) + .await?; + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); + if summary.is_fatal() { + bail!("Fatal scrub errors detected"); + } + if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + bail!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + Ok(()) + } else { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + 
.unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } } diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_pageserver_metadata.rs similarity index 100% rename from s3_scrubber/src/scan_metadata.rs rename to s3_scrubber/src/scan_pageserver_metadata.rs diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs new file mode 100644 index 0000000000..f56bc165db --- /dev/null +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -0,0 +1,234 @@ +use std::{collections::HashSet, str::FromStr}; + +use aws_sdk_s3::Client; +use futures::stream::{StreamExt, TryStreamExt}; +use pageserver_api::shard::TenantShardId; +use postgres_ffi::{XLogFileName, PG_TLI}; +use serde::Serialize; +use tokio_postgres::types::PgLsn; +use tracing::{error, info, trace}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use crate::{ + cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; + +/// Generally we should ask safekeepers, but so far we use everywhere default 16MB. +const WAL_SEGSIZE: usize = 16 * 1024 * 1024; + +#[derive(Serialize)] +pub struct MetadataSummary { + timeline_count: usize, + with_errors: HashSet, + deleted_count: usize, +} + +impl MetadataSummary { + fn new() -> Self { + Self { + timeline_count: 0, + with_errors: HashSet::new(), + deleted_count: 0, + } + } + + pub fn summary_string(&self) -> String { + format!( + "timeline_count: {}, with_errors: {}", + self.timeline_count, + self.with_errors.len() + ) + } + + pub fn is_empty(&self) -> bool { + self.timeline_count == 0 + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } +} + +/// Scan the safekeeper metadata in an S3 bucket, reporting errors and +/// statistics. +/// +/// It works by listing timelines along with timeline_start_lsn and backup_lsn +/// in debug dump in dump_db_table and verifying its s3 contents. If some WAL +/// segments are missing, before complaining control plane is queried to check if +/// the project wasn't deleted in the meanwhile. +pub async fn scan_safekeeper_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result { + info!( + "checking bucket {}, region {}, dump_db_table {}", + bucket_config.bucket, bucket_config.region, dump_db_table + ); + // Use the native TLS implementation (Neon requires TLS) + let tls_connector = + postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", + dump_db_table, tenant_filter_clause, + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + info!("loaded {} timelines", timelines.len()); + + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let console_config = ConsoleConfig::from_env()?; + let cloud_admin_api_client = CloudAdminApiClient::new(console_config); + + let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { + let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); + let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); + let timeline_start_lsn_pg: PgLsn = row.get(2); + let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); + let backup_lsn_pg: PgLsn = row.get(3); + let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + check_timeline( + &s3_client, + &target, + &cloud_admin_api_client, + ttid, + timeline_start_lsn, + backup_lsn, + ) + }); + // Run multiple check_timeline's concurrently. + const CONCURRENCY: usize = 32; + let mut timelines = checks.try_buffered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + while let Some(r) = timelines.next().await { + let res = r?; + summary.timeline_count += 1; + if !res.is_ok { + summary.with_errors.insert(res.ttid); + } + if res.is_deleted { + summary.deleted_count += 1; + } + } + + Ok(summary) +} + +struct TimelineCheckResult { + ttid: TenantTimelineId, + is_ok: bool, + is_deleted: bool, // timeline is deleted in cplane +} + +/// List s3 and check that is has all expected WAL for the ttid. Consistency +/// errors are logged to stderr; returns Ok(true) if timeline is consistent, +/// Ok(false) if not, Err if failed to check. +async fn check_timeline( + s3_client: &Client, + root: &RootTarget, + api_client: &CloudAdminApiClient, + ttid: TenantTimelineId, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +) -> anyhow::Result { + trace!( + "checking ttid {}, should contain WAL [{}-{}]", + ttid, + timeline_start_lsn, + backup_lsn + ); + // calculate expected segfiles + let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); + let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE); + let mut expected_segfiles: HashSet = HashSet::from_iter( + (expected_first_segno..expected_last_segno) + .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), + ); + let expected_files_num = expected_segfiles.len(); + trace!("expecting {} files", expected_segfiles.len(),); + + // now list s3 and check if it misses something + let ttshid = + TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id); + let mut timeline_dir_target = root.timeline_root(&ttshid); + // stream_listing yields only common_prefixes if delimiter is not empty, but + // we need files, so unset it. 
+ timeline_dir_target.delimiter = String::new(); + + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let obj = obj?; + let key = obj.key(); + + let seg_name = key + .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .expect("failed to extract segment name"); + expected_segfiles.remove(seg_name); + } + if !expected_segfiles.is_empty() { + // Before complaining check cplane, probably timeline is already deleted. + let bdata = api_client.find_timeline_branch(ttid.timeline_id).await?; + let deleted = match bdata { + Some(bdata) => bdata.deleted, + None => { + // note: should be careful with selecting proper cplane address + info!("ttid {} not found, assuming it is deleted", ttid); + true + } + }; + if deleted { + // ok, branch is deleted + return Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: true, + }); + } + error!( + "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}", + ttid, + expected_segfiles.len(), + expected_files_num, + timeline_start_lsn, + backup_lsn, + ); + return Ok(TimelineCheckResult { + ttid, + is_ok: false, + is_deleted: false, + }); + } + Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: false, + }) +} diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index abe2718a49..fa83ebdccb 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3734,7 +3734,9 @@ class S3Scrubber: return stdout def scan_metadata(self) -> Any: - stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30) + stdout = self.scrubber_cli( + ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 + ) try: return json.loads(stdout) From ea37234cccb6448383bbb7d76e315a7db1af3125 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 5 Apr 2024 14:29:35 +0300 Subject: [PATCH 115/157] s3_scrubber: revive garbage collection for safekeepers. - pageserver_id in project details is now is optional, fix it - add active_timeline_count guard/stat similar to active_tenant_count - fix safekeeper prefix - count and log deleted keys --- s3_scrubber/src/cloud_admin_api.rs | 2 +- s3_scrubber/src/garbage.rs | 54 +++++++++++++++++++++++++++++- s3_scrubber/src/lib.rs | 2 +- s3_scrubber/src/metadata_stream.rs | 4 +-- 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 45cac23690..66ca2f7180 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -137,7 +137,7 @@ pub struct ProjectData { pub region_id: String, pub platform_id: String, pub user_id: String, - pub pageserver_id: u64, + pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, pub safekeepers: Vec, diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 7a08dffc66..de3b16b49b 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -60,6 +60,7 @@ pub struct GarbageList { /// see garbage, we saw some active tenants too. This protects against classes of bugs /// in the scrubber that might otherwise generate a "deleted all" result. 
active_tenant_count: usize, + active_timeline_count: usize, } impl GarbageList { @@ -67,6 +68,7 @@ impl GarbageList { Self { items: Vec::new(), active_tenant_count: 0, + active_timeline_count: 0, node_kind, bucket_config, } @@ -221,6 +223,7 @@ async fn find_garbage_inner( } else { tracing::debug!("Tenant {tenant_shard_id} is active"); active_tenants.push(tenant_shard_id); + garbage.active_tenant_count = active_tenants.len(); } counter += 1; @@ -271,15 +274,29 @@ async fn find_garbage_inner( std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. + let mut active_timelines: Vec = vec![]; while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { tracing::debug!("Timeline {ttid} is garbage"); } else { tracing::debug!("Timeline {ttid} is active"); + active_timelines.push(ttid); + garbage.active_timeline_count = active_timelines.len(); } } + let num_garbage_timelines = garbage + .items + .iter() + .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + .count(); + tracing::info!( + "Found {}/{} garbage timelines in active tenants", + num_garbage_timelines, + active_timelines.len(), + ); + Ok(garbage) } @@ -344,16 +361,22 @@ pub async fn get_timeline_objects( const MAX_KEYS_PER_DELETE: usize = 1000; /// Drain a buffer of keys into DeleteObjects requests +/// +/// If `drain` is true, drains keys completely; otherwise stops when < +/// MAX_KEYS_PER_DELETE keys are left. +/// `num_deleted` returns number of deleted keys. async fn do_delete( s3_client: &Arc, bucket_name: &str, keys: &mut Vec, dry_run: bool, drain: bool, + progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); for k in request_keys { @@ -368,12 +391,30 @@ async fn do_delete( .send() .await .context("DeleteObjects request")?; + progress_tracker.register(num_deleted); } } Ok(()) } +/// Simple tracker reporting each 10k deleted keys. 
+#[derive(Default)] +struct DeletionProgressTracker { + num_deleted: usize, + last_reported_num_deleted: usize, +} + +impl DeletionProgressTracker { + fn register(&mut self, n: usize) { + self.num_deleted += n; + if self.num_deleted - self.last_reported_num_deleted > 10000 { + tracing::info!("progress: deleted {} keys", self.num_deleted); + self.last_reported_num_deleted = self.num_deleted; + } + } +} + pub async fn purge_garbage( input_path: String, mode: PurgeMode, @@ -394,6 +435,14 @@ pub async fn purge_garbage( if garbage_list.active_tenant_count == 0 { anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants"); } + if garbage_list + .items + .iter() + .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + && garbage_list.active_timeline_count == 0 + { + anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + } let filtered_items = garbage_list .items @@ -429,6 +478,7 @@ pub async fn purge_garbage( std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); let mut objects_to_delete = Vec::new(); + let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; objects_to_delete.append(&mut object_list); @@ -439,6 +489,7 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, false, + &mut progress_tracker, ) .await?; } @@ -450,10 +501,11 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, true, + &mut progress_tracker, ) .await?; - tracing::info!("Fell through"); + tracing::info!("{} keys deleted in total", progress_tracker.num_deleted); Ok(()) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 43be258150..78ad9d0da7 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -343,7 +343,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), delimiter, }), }; diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index b192e0be2e..c05874f556 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -114,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>( let timelines_target = target.timelines_root(&tenant); loop { - tracing::info!("Listing in {}", tenant); + tracing::debug!("Listing in {}", tenant); let fetch_response = list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) .await; @@ -151,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>( } } - tracing::info!("Yielding for {}", tenant); + tracing::debug!("Yielding for {}", tenant); Ok(stream! { for i in timeline_ids { let id = i?; From 7434674d86d8064122b9cc80529ca989ec3b0a88 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 5 Apr 2024 20:25:09 +0300 Subject: [PATCH 116/157] Decrease CONSOLE_CONCURRENCY. Last run with 128 created too much load on cplane. --- s3_scrubber/src/garbage.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index de3b16b49b..91f5fa4334 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -121,7 +121,10 @@ pub async fn find_garbage( const S3_CONCURRENCY: usize = 32; // How many concurrent API requests to make to the console API. 
-const CONSOLE_CONCURRENCY: usize = 128; +// +// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It +// would be better to implement real rsp limiter. +const CONSOLE_CONCURRENCY: usize = 16; struct ConsoleCache { /// Set of tenants found in the control plane API From 9f792f9c0b9758320848a6aeb7e720af6d3eafdf Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 8 Apr 2024 07:56:04 +0300 Subject: [PATCH 117/157] Recheck tenant_id in find_timeline_branch. As it turns out we have at least one case of the same timeline_id in different projects. --- s3_scrubber/src/cloud_admin_api.rs | 36 +++++++++++++-------- s3_scrubber/src/garbage.rs | 2 +- s3_scrubber/src/scan_safekeeper_metadata.rs | 4 ++- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 66ca2f7180..d35dc7e3ca 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -155,7 +155,7 @@ pub struct ProjectData { pub maintenance_set: Option, } -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Clone, serde::Deserialize)] pub struct BranchData { pub id: BranchId, pub created_at: DateTime, @@ -327,6 +327,7 @@ impl CloudAdminApiClient { pub async fn find_timeline_branch( &self, + tenant_id: TenantId, timeline_id: TimelineId, ) -> Result, Error> { let _permit = self @@ -359,19 +360,28 @@ impl CloudAdminApiClient { ErrorKind::BodyRead(e), ) })?; - match response.data.len() { - 0 => Ok(None), - 1 => Ok(Some( - response - .data - .into_iter() - .next() - .expect("Should have exactly one element"), - )), - too_many => Err(Error::new( - format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"), + let mut branches: Vec = response.data.into_iter().collect(); + // Normally timeline_id is unique. However, we do have at least one case + // of the same timeline_id in two different projects, apparently after + // manual recovery. So always recheck project_id (discovered through + // tenant_id). + let project_data = match self.find_tenant_project(tenant_id).await? { + Some(pd) => pd, + None => return Ok(None), + }; + branches.retain(|b| b.project_id == project_data.id); + if branches.len() < 2 { + Ok(branches.first().cloned()) + } else { + Err(Error::new( + format!( + "Find branch for timeline {}/{} returned {} branches instead of 0 or 1", + tenant_id, + timeline_id, + branches.len() + ), ErrorKind::UnexpectedState, - )), + )) } } diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 91f5fa4334..ce0ff10ec6 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -267,7 +267,7 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); async move { api_client - .find_timeline_branch(ttid.timeline_id) + .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (ttid, r)) diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs index f56bc165db..73dd49ceb5 100644 --- a/s3_scrubber/src/scan_safekeeper_metadata.rs +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -195,7 +195,9 @@ async fn check_timeline( } if !expected_segfiles.is_empty() { // Before complaining check cplane, probably timeline is already deleted. 
- let bdata = api_client.find_timeline_branch(ttid.timeline_id).await?; + let bdata = api_client + .find_timeline_branch(ttid.tenant_id, ttid.timeline_id) + .await?; let deleted = match bdata { Some(bdata) => bdata.deleted, None => { From 4ac4b2159838f9b98d766d53a5f876fedb94c2e4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 9 Apr 2024 07:18:26 +0300 Subject: [PATCH 118/157] Add retries to cloud_admin client. --- Cargo.lock | 1 + s3_scrubber/Cargo.toml | 1 + s3_scrubber/src/cloud_admin_api.rs | 189 ++++++++++++++++------------- 3 files changed, 108 insertions(+), 83 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f2f06210cf..2b100560dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5127,6 +5127,7 @@ dependencies = [ "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", + "tokio-util", "tracing", "tracing-appender", "tracing-subscriber", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 37124e6caf..dd5d453a2b 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -27,6 +27,7 @@ postgres-native-tls.workspace = true postgres_ffi.workspace = true tokio-stream.workspace = true tokio-postgres.workspace = true +tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true camino.workspace = true diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index d35dc7e3ca..70b108cf23 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -1,11 +1,13 @@ -use std::time::Duration; - use chrono::{DateTime, Utc}; +use futures::Future; use hex::FromHex; + use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use utils::backoff; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -210,30 +212,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("tenant_id", tenant_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("tenant_id", tenant_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_tenant_project", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::BodyRead(e), - ) - })?; match response.data.len() { 0 => Ok(None), 1 => Ok(Some( @@ -261,42 +272,34 @@ impl CloudAdminApiClient { const PAGINATION_LIMIT: usize = 512; let mut result: Vec = Vec::with_capacity(PAGINATION_LIMIT); loop { - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("show_deleted", "false".to_string()), - ("limit", format!("{PAGINATION_LIMIT}")), - ("offset", format!("{pagination_offset}")), 
- ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "List active projects".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response_bytes = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("show_deleted", "false".to_string()), + ("limit", format!("{PAGINATION_LIMIT}")), + ("offset", format!("{pagination_offset}")), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "List active projects".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - match response.status() { - StatusCode::OK => {} - StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => { - tokio::time::sleep(Duration::from_millis(500)).await; - continue; - } - _status => { - return Err(Error::new( - "List active projects".to_string(), - ErrorKind::ResponseStatus(response.status()), - )) - } - } - - let response_bytes = response.bytes().await.map_err(|e| { - Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) - })?; + response.bytes().await.map_err(|e| { + Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) + }) + }, + "list_projects", + ) + .await?; let decode_result = serde_json::from_slice::>>(&response_bytes); @@ -336,30 +339,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/branches")) - .query(&[ - ("timeline_id", timeline_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("timeline_id", timeline_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_timeline_branch", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::BodyRead(e), - ) - })?; let mut branches: Vec = response.data.into_iter().collect(); // Normally timeline_id is unique. However, we do have at least one case // of the same timeline_id in two different projects, apparently after @@ -542,4 +554,15 @@ impl CloudAdminApiClient { .parse() .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}")) } + + async fn with_retries(op: O, description: &str) -> Result + where + O: FnMut() -> F, + F: Future>, + { + let cancel = CancellationToken::new(); // not really used + backoff::retry(op, |_| false, 1, 20, description, &cancel) + .await + .expect("cancellations are disabled") + } } From 3a2f10712ad557c978f966579c9bfa89ad6f4bae Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 10 Apr 2024 22:52:57 +0300 Subject: [PATCH 119/157] Add more context to s3 listing error. 
--- s3_scrubber/src/lib.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 78ad9d0da7..e976e66748 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -368,7 +368,10 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { - error!("list_objects_v2 query failed: {e}"); + error!( + "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", + s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + ); tokio::time::sleep(Duration::from_secs(1)).await; } } From a74b60066c7e0d4679d0d2ae7cfce6cd2f488e6e Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 30 Apr 2024 16:21:54 +0100 Subject: [PATCH 120/157] storage controller: test for large shard counts (#7475) ## Problem Storage controller was observed to have unexpectedly large memory consumption when loaded with many thousands of shards. This was recently fixed: - https://github.com/neondatabase/neon/pull/7493 ...but we need a general test that the controller is well behaved with thousands of shards. Closes: https://github.com/neondatabase/neon/issues/7460 Closes: https://github.com/neondatabase/neon/issues/7463 ## Summary of changes - Add test test_storage_controller_many_tenants to exercise the system's behaviour with a more substantial workload. This test measures memory consumption and reproduces #7460 before the other changes in this PR. - Tweak reconcile_all's return value to make it nonzero if it spawns no reconcilers, but _would_ have spawned some reconcilers if they weren't blocked by the reconcile concurrency limit. This makes the test's reconcile_until_idle behave as expected (i.e. not complete until the system is nice and calm). - Fix an issue where tenant migrations would leave a spurious secondary location when migrated to some location that was not already their secondary (this was an existing low-impact bug that tripped up the test's consistency checks). On the test with 8000 shards, the resident memory per shard is about 20KiB. This is not really per-shard memory: the primary source of memory growth is the number of concurrent network/db clients we create. With 8000 shards, the test takes 125s to run on my workstation. 
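To illustrate the adjusted `reconcile_all` contract, here is a rough Rust sketch (not part of this patch) of an idle-polling helper analogous to the test fixture's `reconcile_until_idle`: a return value of 0 means the system is fully reconciled, while any nonzero value means reconcilers were spawned or are still queued behind the concurrency limit. The helper itself and its timeout/backoff values are assumptions for illustration; only `reconcile_all()` and its return semantics come from this change.

```rust
use std::time::{Duration, Instant};

/// Hypothetical helper (illustration only): poll Service::reconcile_all()
/// until it reports an idle system. 0 => everything is reconciled; nonzero =>
/// reconcilers were spawned, or are still queued behind the reconciler
/// concurrency limit, so keep polling with exponential backoff.
async fn reconcile_until_idle(service: &Service, timeout: Duration) -> anyhow::Result<()> {
    let started = Instant::now();
    let mut delay = Duration::from_millis(500);
    loop {
        if service.reconcile_all() == 0 {
            return Ok(());
        }
        if started.elapsed() > timeout {
            anyhow::bail!("timed out waiting for reconciliation to become idle");
        }
        // Polling too frequently only adds load while shards wait on the
        // concurrency semaphore, so back off between calls.
        tokio::time::sleep(delay).await;
        delay = std::cmp::min(delay * 2, Duration::from_secs(5));
    }
}
```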
--- Cargo.lock | 1 + control_plane/Cargo.toml | 1 + control_plane/src/bin/neon_local.rs | 4 +- control_plane/src/local_env.rs | 28 +++ control_plane/src/storage_controller.rs | 14 +- storage_controller/src/service.rs | 18 +- storage_controller/src/tenant_shard.rs | 4 +- test_runner/fixtures/compute_reconfigure.py | 11 + test_runner/fixtures/neon_fixtures.py | 34 ++- .../test_storage_controller_scale.py | 198 ++++++++++++++++++ 10 files changed, 292 insertions(+), 21 deletions(-) create mode 100644 test_runner/performance/test_storage_controller_scale.py diff --git a/Cargo.lock b/Cargo.lock index 2b100560dd..e4bf71c64f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1319,6 +1319,7 @@ dependencies = [ "git-version", "hex", "humantime", + "humantime-serde", "hyper 0.14.26", "nix 0.27.1", "once_cell", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index b544a8c587..2ce041068e 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -17,6 +17,7 @@ nix.workspace = true once_cell.workspace = true postgres.workspace = true hex.workspace = true +humantime-serde.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1a9e9a1e6a..bdd64c8687 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1554,8 +1554,8 @@ fn cli() -> Command { Command::new("storage_controller") .arg_required_else_help(true) .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(Command::new("stop").about("Stop local pageserver") + .subcommand(Command::new("start").about("Start storage controller")) + .subcommand(Command::new("stop").about("Stop storage controller") .arg(stop_mode_arg.clone())) ) .subcommand( diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 8cbda528a7..59b587389c 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -17,6 +17,7 @@ use std::net::Ipv4Addr; use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; +use std::time::Duration; use utils::{ auth::{encode_from_key_file, Claims}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -66,6 +67,10 @@ pub struct LocalEnv { pub broker: NeonBroker, + // Configuration for the storage controller (1 per neon_local environment) + #[serde(default)] + pub storage_controller: NeonStorageControllerConf, + /// This Vec must always contain at least one pageserver pub pageservers: Vec, @@ -98,6 +103,29 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } +/// Broker config for cluster internal communication. +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[serde(default)] +pub struct NeonStorageControllerConf { + /// Heartbeat timeout before marking a node offline + #[serde(with = "humantime_serde")] + pub max_unavailable: Duration, +} + +impl NeonStorageControllerConf { + // Use a shorter pageserver unavailability interval than the default to speed up tests. + const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = + std::time::Duration::from_secs(10); +} + +impl Default for NeonStorageControllerConf { + fn default() -> Self { + Self { + max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + } + } +} + // Dummy Default impl to satisfy Deserialize derive. 
impl Default for NeonBroker { fn default() -> Self { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index dbb4475ae8..b919b14758 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,4 +1,7 @@ -use crate::{background_process, local_env::LocalEnv}; +use crate::{ + background_process, + local_env::{LocalEnv, NeonStorageControllerConf}, +}; use camino::{Utf8Path, Utf8PathBuf}; use hyper::Method; use pageserver_api::{ @@ -32,15 +35,13 @@ pub struct StorageController { public_key: Option, postgres_port: u16, client: reqwest::Client, + config: NeonStorageControllerConf, } const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; -// Use a shorter pageserver unavailability interval than the default to speed up tests. -const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); - #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -135,6 +136,7 @@ impl StorageController { client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), + config: env.storage_controller.clone(), } } @@ -272,8 +274,6 @@ impl StorageController { // Run migrations on every startup, in case something changed. let database_url = self.setup_database().await?; - let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into(); - let mut args = vec![ "-l", &self.listen, @@ -283,7 +283,7 @@ impl StorageController { "--database-url", &database_url, "--max-unavailable-interval", - &max_unavailable.to_string(), + &humantime::Duration::from(self.config.max_unavailable).to_string(), ] .into_iter() .map(|s| s.to_string()) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 882562d99f..186a820adf 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -90,7 +90,11 @@ const INITIAL_GENERATION: Generation = Generation::new(0); /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); +/// How long a node may be unresponsive to heartbeats before we declare it offline. +/// This must be long enough to cover node restarts as well as normal operations: in future +/// it should be separated into distinct timeouts for startup vs. normal operation +/// (``) +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; @@ -4251,7 +4255,9 @@ impl Service { /// Check all tenants for pending reconciliation work, and reconcile those in need. /// Additionally, reschedule tenants that require it. /// - /// Returns how many reconciliation tasks were started + /// Returns how many reconciliation tasks were started, or `1` if no reconciles were + /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where + /// available. A return value of 0 indicates that everything is fully reconciled already. 
fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); @@ -4266,7 +4272,11 @@ impl Service { } // Skip checking if this shard is already enqueued for reconciliation - if shard.delayed_reconcile { + if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { + // If there is something delayed, then return a nonzero count so that + // callers like reconcile_all_now do not incorrectly get the impression + // that the system is in a quiescent state. + reconciles_spawned = std::cmp::max(1, reconciles_spawned); continue; } @@ -4451,7 +4461,7 @@ impl Service { waiter_count ); - Ok(waiter_count) + Ok(std::cmp::max(waiter_count, reconciles_spawned)) } pub async fn shutdown(&self) { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 7b11dfe64d..ffbf2c4b7a 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -952,8 +952,8 @@ impl TenantShard { /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. /// - /// This is appropriate when you can't spawn a recociler (e.g. due to resource limits), but - /// you would like to wait until one gets spawned in the background. + /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but + /// you would like to wait on the next reconciler that gets spawned in the background. pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { self.ensure_sequence_ahead(); diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 9dd66fe636..a883d94f73 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -14,10 +14,18 @@ class ComputeReconfigure: self.server = server self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" self.workloads = {} + self.on_notify = None def register_workload(self, workload): self.workloads[workload.tenant_id] = workload + def register_on_notify(self, fn): + """ + Add some extra work during a notification, like sleeping to slow things down, or + logging what was notified. 
+ """ + self.on_notify = fn + @pytest.fixture(scope="function") def compute_reconfigure_listener(make_httpserver): @@ -43,6 +51,9 @@ def compute_reconfigure_listener(make_httpserver): body: dict[str, Any] = request.json log.info(f"notify-attach request: {body}") + if self.on_notify is not None: + self.on_notify(body) + try: workload = self.workloads[TenantId(body["tenant_id"])] except KeyError: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fa83ebdccb..fbd1e22aa9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -499,6 +499,7 @@ class NeonEnvBuilder: self.config_init_force: Optional[str] = None self.top_output_dir = top_output_dir self.control_plane_compute_hook_api: Optional[str] = None + self.storage_controller_config: Optional[dict[Any, Any]] = None self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine @@ -1021,6 +1022,7 @@ class NeonEnv: self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 self.pageserver_config_override = config.pageserver_config_override + self.storage_controller_config = config.storage_controller_config # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -1066,6 +1068,9 @@ class NeonEnv: if self.control_plane_compute_hook_api is not None: cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.storage_controller_config is not None: + cfg["storage_controller"] = self.storage_controller_config + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1134,12 +1139,9 @@ class NeonEnv: # bounce through retries on startup self.storage_controller.start() - def storage_controller_ready(): - assert self.storage_controller.ready() is True - # Wait for storage controller readiness to prevent unnecessary post start-up # reconcile. 
- wait_until(30, 1, storage_controller_ready) + self.storage_controller.wait_until_ready() # Start up broker, pageserver and all safekeepers futs = [] @@ -2043,6 +2045,15 @@ class NeonStorageController(MetricsGetter): else: raise RuntimeError(f"Unexpected status {status} from readiness endpoint") + def wait_until_ready(self): + t1 = time.time() + + def storage_controller_ready(): + assert self.ready() is True + + wait_until(30, 1, storage_controller_ready) + return time.time() - t1 + def attach_hook_issue( self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int ) -> int: @@ -2130,7 +2141,7 @@ class NeonStorageController(MetricsGetter): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, - placement_policy: Optional[str] = None, + placement_policy: Optional[Union[Dict[Any, Any] | str]] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -2240,10 +2251,21 @@ class NeonStorageController(MetricsGetter): def reconcile_until_idle(self, timeout_secs=30): start_at = time.time() n = 1 + delay_sec = 0.5 + delay_max = 5 while n > 0: n = self.reconcile_all() - if time.time() - start_at > timeout_secs: + if n == 0: + break + elif time.time() - start_at > timeout_secs: raise RuntimeError("Timeout in reconcile_until_idle") + else: + # Don't call again right away: if we're waiting for many reconciles that + # are blocked on the concurrency limit, it slows things down to call + # reconcile_all frequently. + time.sleep(delay_sec) + delay_sec *= 2 + delay_sec = min(delay_sec, delay_max) def consistency_check(self): """ diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py new file mode 100644 index 0000000000..17dc96dabe --- /dev/null +++ b/test_runner/performance/test_storage_controller_scale.py @@ -0,0 +1,198 @@ +import concurrent.futures +import random +import time + +import pytest +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TenantShardId, TimelineId + + +@pytest.mark.timeout(3600) # super long running test: should go down as we optimize +def test_storage_controller_many_tenants( + neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure +): + """ + Check that we cope well with a not-totally-trivial number of tenants. + + This is checking for: + - Obvious concurrency bugs from issuing many tenant creations/modifications + concurrently. + - Obvious scaling bugs like O(N^2) scaling that would be so slow that even + a basic test starts failing from slowness. + + This is _not_ a comprehensive scale test: just a basic sanity check that + we don't fall over for a thousand shards. + """ + + neon_env_builder.num_pageservers = 5 + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to + # guard against regressions in restart time. 
+ "max_unavailable": "300s" + } + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write + compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) + + env = neon_env_builder.init_start() + + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + + for ps in env.pageservers: + # This can happen because when we do a loop over all pageservers and mark them offline/active, + # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of + # bumping generation before other attachments are detached. + # + # We could clean this up by making reconcilers respect the .observed of their predecessor, if + # we spawn with a wait for the predecessor. + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # Storage controller is allowed to drop pageserver requests when the cancellation token + # for a Reconciler fires. + ps.allowed_errors.append(".*request was dropped before completing.*") + + # Total tenants + tenant_count = 4000 + + # Shards per tenant + shard_count = 2 + stripe_size = 1024 + + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + def check_memory(): + # Shards should be cheap_ in memory, as we will have very many of them + expect_memory_per_shard = 128 * 1024 + + rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") + assert rss is not None + log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") + assert rss < expect_memory_per_shard * shard_count * tenant_count + + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. + rng = random.Random(1234) + + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore + # permits, to ensure that we are exercising stressing that. + api_concurrency = 135 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + futs = [] + t1 = time.time() + for tenant_id in tenants: + f = executor.submit( + env.storage_controller.tenant_create, + tenant_id, + shard_count, + stripe_size, + placement_policy={"Attached": 1}, + ) + futs.append(f) + + # Wait for creations to finish + for f in futs: + f.result() + log.info( + f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" + ) + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + op_tenants = list(tenants)[0:run_ops] + + # Generate a mixture of operations and dispatch them all concurrently + futs = [] + for tenant_id in op_tenants: + op = rng.choice([0, 1, 2]) + if op == 0: + # A fan-out write operation to all shards in a tenant (timeline creation) + f = executor.submit( + virtual_ps_http.timeline_create, + PgVersion.NOT_SET, + tenant_id, + TimelineId.generate(), + ) + elif op == 1: + # A reconciler operation: migrate a shard. + shard_number = rng.randint(0, shard_count - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + f = executor.submit( + env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id + ) + elif op == 2: + # A passthrough read to shard zero + f = executor.submit(virtual_ps_http.tenant_status, tenant_id) + + futs.append(f) + + # Wait for mixed ops to finish + for f in futs: + f.result() + + # Consistency check is safe here: all the previous operations waited for reconcile before completing + env.storage_controller.consistency_check() + check_memory() + + # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time + # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if + # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling) + # + # We do not require that the system is quiescent already here, although at present in this point in the test + # that may be the case. + while True: + t1 = time.time() + reconcilers = env.storage_controller.reconcile_all() + if reconcilers == 0: + # Time how long a no-op background reconcile takes: this measures how long it takes to + # loop over all the shards looking for work to do. + runtime = time.time() - t1 + log.info(f"No-op call to reconcile_all took {runtime}s") + assert runtime < 1 + break + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + # See how long the controller takes to pass its readiness check. This should be fast because + # all the nodes are online: offline pageservers are the only thing that's allowed to delay + # startup. + readiness_period = env.storage_controller.wait_until_ready() + assert readiness_period < 5 + + # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers + # to run, as it was in a stable state before restart. If it did, that's a bug. + env.storage_controller.consistency_check() + check_memory() + + # Restart pageservers: this exercises the /re-attach API + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, + # as they were not offline long enough to trigger any scheduling changes. 
+ env.storage_controller.consistency_check() + check_memory() + + # Stop the storage controller before tearing down fixtures, because it otherwise might log + # errors trying to call our `ComputeReconfigure`. + env.storage_controller.stop() From e018cac1f714626b1dca7eeab8eab0951cbfaed2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 30 Apr 2024 18:00:24 +0100 Subject: [PATCH 121/157] tests: tweak log allow list in test_sharding_split_failures (#7549) ## Problem This test became flaky recently with failures like: ``` AssertionError: Log errors on storage_controller: (129, '2024-04-29T16:41:03.591506Z ERROR request{method=PUT path=/control/v1/tenant/b38c0447fbdbcf4e1c023f00b0f7c221/shard_split request_id=34df4975-2ef3-4ed8-b167-2956650e365c}: Error processing HTTP request: InternalServerError(Reconcile error on shard b38c0447fbdbcf4e1c023f00b0f7c221-0002: Cancelled\n') ``` Likely due to #7508 changing how errors are reported from Reconcilers. ## Summary of changes - Tolerate `Reconcile error.*Cancelled` log errors --- test_runner/regress/test_sharding.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 101d2620b0..bae5945338 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -928,6 +928,8 @@ def test_sharding_split_failures( ".*Reconcile error: receive body: error sending request for url.*", # Node offline cases will fail inside reconciler when detaching secondaries ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline + ".*Reconcile error.*Cancelled.*", # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", ] From fcbe60f43691b05d064b4b781e01e50ffb985d55 Mon Sep 17 00:00:00 2001 From: Andrew Rudenko Date: Tue, 30 Apr 2024 19:44:02 +0200 Subject: [PATCH 122/157] Makefile: DISABLE_HOMEBREW variable (#7556) ## Problem The current Makefile assumes that homebrew is used on macos. There are other ways to install dependencies on MacOS (nix, macports, "manually"). It would be great to allow the one who wants to use other options to disable homebrew integration. ## Summary of changes It adds DISABLE_HOMEBREW variable that if set skips extra homebrew-specific configuration steps. 
--- Makefile | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index f13f080f1a..5e2b3c4367 100644 --- a/Makefile +++ b/Makefile @@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux PG_CONFIGURE_OPTS += --with-libseccomp else ifeq ($(UNAME_S),Darwin) - # macOS with brew-installed openssl requires explicit paths - # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) - PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib - PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig - # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure - # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage - EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + ifndef DISABLE_HOMEBREW + # macOS with brew-installed openssl requires explicit paths + # It can be configured with OPENSSL_PREFIX variable + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib + PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig + # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure + # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage + EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + endif endif # Use -C option so that when PostgreSQL "make install" installs the From 50a45e67dc295f01c32a397a1951205666406b3f Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 30 Apr 2024 21:50:03 +0300 Subject: [PATCH 123/157] Discover safekeepers via broker request (#7279) We had an incident where pageserver requests timed out because pageserver couldn't fetch WAL from safekeepers. This incident was caused by a bug in safekeeper logic for timeline activation, which prevented pageserver from finding safekeepers. This bug was since fixed, but there is still a chance of a similar bug in the future due to overall complexity. We add a new broker message to "signal interest" for timeline. This signal will be sent by pageservers `wait_lsn`, and safekeepers will receive this signal to start broadcasting broker messages. Then every broker subscriber will be able to find the safekeepers and connect to them (to start fetching WAL). This feature is not limited to pageservers and any service that wants to download WAL from safekeepers will be able to use this discovery request. This commit changes pageserver's connection_manager (walreceiver) to send a SafekeeperDiscoveryRequest when there is no information about safekeepers present in memory. Current implementation will send these requests only if there is an active wait_lsn() call and no more often than once per 10 seconds. Add `test_broker_discovery` to test this: safekeepers started with `--disable-periodic-broker-push` will not push info to broker so that pageserver must use a discovery to start fetching WAL. Add task_stats in safekeepers broker module to log a warning if there is no message received from the broker for the last 10 seconds. 
Closes #5471 --------- Co-authored-by: Christian Schwarz --- libs/utils/src/seqwait.rs | 126 +++++++++---- pageserver/src/tenant/timeline.rs | 6 + .../walreceiver/connection_manager.rs | 164 ++++++++++++++--- safekeeper/src/bin/safekeeper.rs | 5 + safekeeper/src/broker.rs | 166 +++++++++++++++++- safekeeper/src/lib.rs | 2 + .../tests/walproposer_sim/safekeeper.rs | 1 + storage_broker/src/bin/storage_broker.rs | 12 +- test_runner/regress/test_wal_acceptor.py | 48 ++++- 9 files changed, 464 insertions(+), 66 deletions(-) diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 0544c5be03..375b227b99 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -2,11 +2,10 @@ use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; -use std::fmt::Debug; use std::mem; use std::sync::Mutex; use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::sync::watch::{self, channel}; use tokio::time::timeout; /// An error happened while waiting for a number @@ -35,23 +34,73 @@ pub trait MonotonicCounter { fn cnt_value(&self) -> V; } -/// Internal components of a `SeqWait` -struct SeqWaitInt +/// Heap of waiters, lowest numbers pop first. +struct Waiters where - S: MonotonicCounter, V: Ord, { - waiters: BinaryHeap>, - current: S, - shutdown: bool, + heap: BinaryHeap>, + /// Number of the first waiter in the heap, or None if there are no waiters. + status_channel: watch::Sender>, +} + +impl Waiters +where + V: Ord + Copy, +{ + fn new() -> Self { + Waiters { + heap: BinaryHeap::new(), + status_channel: channel(None).0, + } + } + + /// `status_channel` contains the number of the first waiter in the heap. + /// This function should be called whenever waiters heap changes. + fn update_status(&self) { + let first_waiter = self.heap.peek().map(|w| w.wake_num); + let _ = self.status_channel.send_replace(first_waiter); + } + + /// Add new waiter to the heap, return a channel that will be notified when the number arrives. + fn add(&mut self, num: V) -> watch::Receiver<()> { + let (tx, rx) = channel(()); + self.heap.push(Waiter { + wake_num: num, + wake_channel: tx, + }); + self.update_status(); + rx + } + + /// Pop all waiters <= num from the heap. Collect channels in a vector, + /// so that caller can wake them up. + fn pop_leq(&mut self, num: V) -> Vec> { + let mut wake_these = Vec::new(); + while let Some(n) = self.heap.peek() { + if n.wake_num > num { + break; + } + wake_these.push(self.heap.pop().unwrap().wake_channel); + } + self.update_status(); + wake_these + } + + /// Used on shutdown to efficiently drop all waiters. + fn take_all(&mut self) -> BinaryHeap> { + let heap = mem::take(&mut self.heap); + self.update_status(); + heap + } } struct Waiter where T: Ord, { - wake_num: T, // wake me when this number arrives ... - wake_channel: Sender<()>, // ... by sending a message to this channel + wake_num: T, // wake me when this number arrives ... + wake_channel: watch::Sender<()>, // ... by sending a message to this channel } // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here @@ -76,6 +125,17 @@ impl PartialEq for Waiter { impl Eq for Waiter {} +/// Internal components of a `SeqWait` +struct SeqWaitInt +where + S: MonotonicCounter, + V: Ord, +{ + waiters: Waiters, + current: S, + shutdown: bool, +} + /// A tool for waiting on a sequence number /// /// This provides a way to wait the arrival of a number. 
@@ -108,7 +168,7 @@ where /// Create a new `SeqWait`, initialized to a particular number pub fn new(starting_num: S) -> Self { let internal = SeqWaitInt { - waiters: BinaryHeap::new(), + waiters: Waiters::new(), current: starting_num, shutdown: false, }; @@ -128,9 +188,8 @@ where // Block any future waiters from starting internal.shutdown = true; - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) + // Take all waiters to drop them later. + internal.waiters.take_all() // Drop the lock as we exit this scope. }; @@ -196,7 +255,7 @@ where /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. - fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { + fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { let mut internal = self.internal.lock().unwrap(); if internal.current.cnt_value() >= num { return Ok(None); @@ -205,12 +264,8 @@ where return Err(SeqWaitError::Shutdown); } - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.push(Waiter { - wake_num: num, - wake_channel: tx, - }); + // Add waiter channel to the queue. + let rx = internal.waiters.add(num); // Drop the lock as we exit this scope. Ok(Some(rx)) } @@ -231,16 +286,8 @@ where } internal.current.cnt_advance(num); - // Pop all waiters <= num from the heap. Collect them in a vector, and - // wake them up after releasing the lock. - let mut wake_these = Vec::new(); - while let Some(n) = internal.waiters.peek() { - if n.wake_num > num { - break; - } - wake_these.push(internal.waiters.pop().unwrap().wake_channel); - } - wake_these + // Pop all waiters <= num from the heap. + internal.waiters.pop_leq(num) }; for tx in wake_these { @@ -255,6 +302,23 @@ where pub fn load(&self) -> S { self.internal.lock().unwrap().current } + + /// Get a Receiver for the current status. + /// + /// The current status is the number of the first waiter in the queue, + /// or None if there are no waiters. + /// + /// This receiver will be notified whenever the status changes. + /// It is useful for receiving notifications when the first waiter + /// starts waiting for a number, or when there are no more waiters left. + pub fn status_receiver(&self) -> watch::Receiver> { + self.internal + .lock() + .unwrap() + .waiters + .status_channel + .subscribe() + } } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2a2c5d4ee5..5537505749 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1253,6 +1253,12 @@ impl Timeline { self.last_record_lsn.load() } + /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no + /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn(). 
+ pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver> { + self.last_record_lsn.status_receiver() + } + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index dae31934ad..7ef063c4e5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -22,10 +22,12 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; + use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, Code, Streaming}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -89,6 +91,14 @@ pub(super) async fn connection_manager_loop_step( .timeline .subscribe_for_state_updates(); + let mut wait_lsn_status = connection_manager_state + .timeline + .subscribe_for_wait_lsn_updates(); + + // TODO: create a separate config option for discovery request interval + let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout; + let mut last_discovery_ts: Option = None; + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. @@ -97,10 +107,12 @@ pub(super) async fn connection_manager_loop_step( loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); + let any_activity = connection_manager_state.wal_connection.is_some() + || !connection_manager_state.wal_stream_candidates.is_empty(); // These things are happening concurrently: // - // - cancellation request + // - cancellation request // - keep receiving WAL on the current connection // - if the shared state says we need to change connection, disconnect and return // - this runs in a separate task and we receive updates via a watch channel @@ -108,6 +120,7 @@ pub(super) async fn connection_manager_loop_step( // - receive updates from broker // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently + // - if there's no connection and no candidates, try to send a discovery request // NB: make sure each of the select expressions are cancellation-safe // (no need for arms to be cancellation-safe). @@ -214,6 +227,65 @@ pub(super) async fn connection_manager_loop_step( } } } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), + + Some(()) = async { + // Reminder: this match arm needs to be cancellation-safe. + // Calculating time needed to wait until sending the next discovery request. + // Current implementation is conservative and sends discovery requests only when there are no candidates. + + if any_activity { + // No need to send discovery requests if there is an active connection or candidates. 
+ return None; + } + + // Waiting for an active wait_lsn request. + while wait_lsn_status.borrow().is_none() { + if wait_lsn_status.changed().await.is_err() { + // wait_lsn_status channel was closed, exiting + warn!("wait_lsn_status channel was closed in connection_manager_loop_step"); + return None; + } + } + + // All preconditions met, preparing to send a discovery request. + let now = std::time::Instant::now(); + let next_discovery_ts = last_discovery_ts + .map(|ts| ts + discovery_request_interval) + .unwrap_or_else(|| now); + + if next_discovery_ts > now { + // Prevent sending discovery requests too frequently. + tokio::time::sleep(next_discovery_ts - now).await; + } + + let tenant_timeline_id = Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SafekeeperDiscoveryRequest { tenant_timeline_id }; + let msg = TypedMessage { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: Some(request), + safekeeper_discovery_response: None, + }; + + last_discovery_ts = Some(std::time::Instant::now()); + debug!("No active connection and no candidates, sending discovery request to the broker"); + + // Cancellation safety: we want to send a message to the broker, but publish_one() + // function can get cancelled by the other select! arm. This is absolutely fine, because + // we just want to receive broker updates and discovery is not important if we already + // receive updates. + // + // It is possible that `last_discovery_ts` will be updated, but the message will not be sent. + // This is totally fine because of the reason above. + + // This is a fire-and-forget request, we don't care about the response + let _ = broker_client.publish_one(msg).await; + debug!("Discovery request sent to the broker"); + None + } => {} } if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { @@ -231,7 +303,7 @@ async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, id: TenantTimelineId, cancel: &CancellationToken, -) -> Result, Cancelled> { +) -> Result, Cancelled> { let mut attempt = 0; loop { exponential_backoff( @@ -244,17 +316,27 @@ async fn subscribe_for_timeline_updates( attempt += 1; // subscribe to the specific timeline - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { - tenant_id: id.tenant_id.as_ref().to_owned(), - timeline_id: id.timeline_id.as_ref().to_owned(), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), + let request = SubscribeByFilterRequest { + types: vec![ + TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo as i32, + }, + TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + }, + ], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }), + }), }; match { tokio::select! { - r = broker_client.subscribe_safekeeper_info(request) => { r } + r = broker_client.subscribe_by_filter(request) => { r } _ = cancel.cancelled() => { return Err(Cancelled); } } } { @@ -398,7 +480,7 @@ struct RetryInfo { /// Data about the timeline to connect to, received from the broker. 
#[derive(Debug, Clone)] struct BrokerSkTimeline { - timeline: SafekeeperTimelineInfo, + timeline: SafekeeperDiscoveryResponse, /// Time at which the data was fetched from the broker last time, to track the stale data. latest_update: NaiveDateTime, } @@ -606,7 +688,41 @@ impl ConnectionManagerState { } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. - fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + fn register_timeline_update(&mut self, typed_msg: TypedMessage) { + let mut is_discovery = false; + let timeline_update = match typed_msg.r#type() { + MessageType::SafekeeperTimelineInfo => { + let info = match typed_msg.safekeeper_timeline_info { + Some(info) => info, + None => { + warn!("bad proto message from broker: no safekeeper_timeline_info"); + return; + } + }; + SafekeeperDiscoveryResponse { + safekeeper_id: info.safekeeper_id, + tenant_timeline_id: info.tenant_timeline_id, + commit_lsn: info.commit_lsn, + safekeeper_connstr: info.safekeeper_connstr, + availability_zone: info.availability_zone, + } + } + MessageType::SafekeeperDiscoveryResponse => { + is_discovery = true; + match typed_msg.safekeeper_discovery_response { + Some(response) => response, + None => { + warn!("bad proto message from broker: no safekeeper_discovery_response"); + return; + } + } + } + _ => { + // unexpected message + return; + } + }; + WALRECEIVER_BROKER_UPDATES.inc(); let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); @@ -619,7 +735,11 @@ impl ConnectionManagerState { ); if old_entry.is_none() { - info!("New SK node was added: {new_safekeeper_id}"); + info!( + ?is_discovery, + %new_safekeeper_id, + "New SK node was added", + ); WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -818,7 +938,7 @@ impl ConnectionManagerState { fn select_connection_candidate( &self, node_to_omit: Option, - ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -828,7 +948,7 @@ impl ConnectionManagerState { /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, - ) -> impl Iterator { + ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates @@ -968,19 +1088,11 @@ mod tests { latest_update: NaiveDateTime, ) -> BrokerSkTimeline { BrokerSkTimeline { - timeline: SafekeeperTimelineInfo { + timeline: SafekeeperDiscoveryResponse { safekeeper_id: 0, tenant_timeline_id: None, - term: 0, - last_log_term: 0, - flush_lsn: 0, commit_lsn, - backup_lsn: 0, - remote_consistent_lsn: 0, - peer_horizon_lsn: 0, - local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), - http_connstr: safekeeper_connstr.to_owned(), availability_zone: None, }, latest_update, diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e53ccaeb3d..09c565ce71 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -177,6 +177,10 @@ struct Args { /// Controls how long backup will wait until uploading the partial segment. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, + /// Disable task to push messages to broker every second. Supposed to + /// be used in tests. 
+ #[arg(long)] + disable_periodic_broker_push: bool, } // Like PathBufValueParser, but allows empty string. @@ -309,6 +313,7 @@ async fn main() -> anyhow::Result<()> { walsenders_keep_horizon: args.walsenders_keep_horizon, partial_backup_enabled: args.partial_backup_enabled, partial_backup_timeout: args.partial_backup_timeout, + disable_periodic_broker_push: args.disable_periodic_broker_push, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 2b1db2714b..98f58d3e49 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,11 +10,20 @@ use anyhow::Result; use storage_broker::parse_proto_ttid; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::FilterTenantTimelineId; +use storage_broker::proto::MessageType; +use storage_broker::proto::SafekeeperDiscoveryResponse; +use storage_broker::proto::SubscribeByFilterRequest; use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::proto::TypeSubscription; +use storage_broker::proto::TypedMessage; use storage_broker::Request; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use std::time::UNIX_EPOCH; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; @@ -31,6 +40,12 @@ const PUSH_INTERVAL_MSEC: u64 = 1000; /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + if conf.disable_periodic_broker_push { + info!("broker push_loop is disabled, doing nothing..."); + futures::future::pending::<()>().await; // sleep forever + return Ok(()); + } + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -75,7 +90,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { +async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; // TODO: subscribe only to local timelines instead of all @@ -94,6 +109,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); while let Some(msg) = stream.message().await? { + stats.update_pulled(); + let proto_ttid = msg .tenant_timeline_id .as_ref() @@ -119,12 +136,93 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { bail!("end of stream"); } +/// Process incoming discover requests. This is done in a separate task to avoid +/// interfering with the normal pull/push loops. +async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { + let mut client = + storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: false, + tenant_timeline_id: None, + }), + }; + + let mut stream = client + .subscribe_by_filter(request) + .await + .context("subscribe_by_filter request failed")? 
+ .into_inner(); + + let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]); + + while let Some(typed_msg) = stream.message().await? { + stats.update_pulled(); + + match typed_msg.r#type() { + MessageType::SafekeeperDiscoveryRequest => { + let msg = typed_msg + .safekeeper_discovery_request + .expect("proto type mismatch from broker message"); + + let proto_ttid = msg + .tenant_timeline_id + .as_ref() + .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; + let ttid = parse_proto_ttid(proto_ttid)?; + if let Ok(tli) = GlobalTimelines::get(ttid) { + // we received a discovery request for a timeline we know about + discover_counter.inc(); + + // create and reply with discovery response + let sk_info = tli.get_safekeeper_info(&conf).await; + let response = SafekeeperDiscoveryResponse { + safekeeper_id: sk_info.safekeeper_id, + tenant_timeline_id: sk_info.tenant_timeline_id, + commit_lsn: sk_info.commit_lsn, + safekeeper_connstr: sk_info.safekeeper_connstr, + availability_zone: sk_info.availability_zone, + }; + + // note this is a blocking call + client + .publish_one(TypedMessage { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: None, + safekeeper_discovery_response: Some(response), + }) + .await?; + } + } + + _ => { + warn!( + "unexpected message type i32 {}, {:?}", + typed_msg.r#type, + typed_msg.r#type() + ); + } + } + } + bail!("end of stream"); +} + pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { info!("started, broker endpoint {:?}", conf.broker_endpoint); let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; + let mut discover_handle: Option>> = None; + + let stats = Arc::new(BrokerStats::new()); + let stats_task = task_stats(stats.clone()); + tokio::pin!(stats_task); // Selecting on JoinHandles requires some squats; is there a better way to // reap tasks individually? @@ -153,13 +251,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { }; pull_handle = None; }, + res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => { + // was it panic or normal error? + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("discover task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) } + }; + discover_handle = None; + }, _ = ticker.tick() => { if push_handle.is_none() { push_handle = Some(tokio::spawn(push_loop(conf.clone()))); } if pull_handle.is_none() { - pull_handle = Some(tokio::spawn(pull_loop(conf.clone()))); + pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone()))); } + if discover_handle.is_none() { + discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone()))); + } + }, + _ = &mut stats_task => {} + } + } +} + +struct BrokerStats { + /// Timestamp of the last received message from the broker. + last_pulled_ts: AtomicU64, +} + +impl BrokerStats { + fn new() -> Self { + BrokerStats { + last_pulled_ts: AtomicU64::new(0), + } + } + + fn now_millis() -> u64 { + std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time is before epoch") + .as_millis() as u64 + } + + /// Update last_pulled timestamp to current time. 
+ fn update_pulled(&self) { + self.last_pulled_ts + .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed); + } +} + +/// Periodically write to logs if there are issues with receiving data from the broker. +async fn task_stats(stats: Arc) { + let warn_duration = Duration::from_secs(10); + let mut ticker = tokio::time::interval(warn_duration); + + loop { + tokio::select! { + _ = ticker.tick() => { + let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst); + if last_pulled == 0 { + // no broker updates yet + continue; + } + + let now = BrokerStats::now_millis(); + if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { + let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + info!("no broker updates for some time, last update: {:?}", ts); + } } } } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 9b4d4dbb38..543714a54e 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -83,6 +83,7 @@ pub struct SafeKeeperConf { pub walsenders_keep_horizon: bool, pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, + pub disable_periodic_broker_push: bool, } impl SafeKeeperConf { @@ -129,6 +130,7 @@ impl SafeKeeperConf { walsenders_keep_horizon: false, partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, } } } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index bc21c4d765..27e2a4453b 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -178,6 +178,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { walsenders_keep_horizon: false, partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 4e5f8ed724..8c88b61abc 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -196,8 +196,13 @@ impl SubscriptionKey { /// Parse from FilterTenantTimelineId pub fn from_proto_filter_tenant_timeline_id( - f: &FilterTenantTimelineId, + opt: Option<&FilterTenantTimelineId>, ) -> Result { + if opt.is_none() { + return Ok(SubscriptionKey::All); + } + + let f = opt.unwrap(); if !f.enabled { return Ok(SubscriptionKey::All); } @@ -534,10 +539,7 @@ impl BrokerService for Broker { .remote_addr() .expect("TCPConnectInfo inserted by handler"); let proto_filter = request.into_inner(); - let ttid_filter = proto_filter - .tenant_timeline_id - .as_ref() - .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; let types_set = proto_filter diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index ac1a747df3..967d133e18 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1828,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint") + timeline_id = env.neon_cli.create_branch("test_idle_reconnections") def 
collect_stats() -> Dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -1859,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint") + endpoint = env.endpoints.create_start("test_idle_reconnections") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() @@ -2007,3 +2007,47 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder): ) log.info(f"dump_control_file response: {res}") assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" + + +# Test disables periodic pushes from safekeeper to the broker and checks that +# pageserver can still discover safekeepers with discovery requests. +def test_broker_discovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_broker_discovery") + + endpoint = env.endpoints.create_start( + "test_broker_discovery", + config_lines=["shared_buffers=1MB"], + ) + endpoint.safe_psql("create table t(i int, payload text)") + # Install extension containing function needed to clear buffer + endpoint.safe_psql("CREATE EXTENSION neon_test_utils") + + def do_something(): + time.sleep(1) + # generate some data to commit WAL on safekeepers + endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") + # clear the buffers + endpoint.safe_psql("select clear_buffer_cache()") + # read data to fetch pages from pageserver + endpoint.safe_psql("select sum(i) from t") + + do_something() + do_something() + + for sk in env.safekeepers: + # Disable periodic broker push, so pageserver won't be able to discover + # safekeepers without sending a discovery request + sk.stop().start(extra_opts=["--disable-periodic-broker-push"]) + + do_something() + do_something() + + # restart pageserver and check how everything works + env.pageserver.stop().start() + + do_something() + do_something() From 26e6ff8ba61c896cae9fd35c1683b0126203f345 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 1 May 2024 11:44:42 -0400 Subject: [PATCH 124/157] chore(pageserver): concise error message for layer traversal (#7565) Instead of showing the full path of layer traversal, we now only show tenant (in tracing context)+timeline+filename. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer/layer.rs | 10 +++++----- pageserver/src/tenant/timeline.rs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index ee9de8de09..2b6934fcee 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -401,8 +401,8 @@ impl Layer { &self.0.path } - pub(crate) fn local_path_str(&self) -> &Arc { - &self.0.path_str + pub(crate) fn debug_str(&self) -> &Arc { + &self.0.debug_str } pub(crate) fn metadata(&self) -> LayerFileMetadata { @@ -527,8 +527,8 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, - /// String representation of the full path, used for traversal id. - path_str: Arc, + /// String representation of the layer, used for traversal id. 
+ debug_str: Arc, desc: PersistentLayerDesc, @@ -735,7 +735,7 @@ impl LayerInner { LayerInner { conf, - path_str: path.to_string().into(), + debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, path, desc, timeline: Arc::downgrade(timeline), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5537505749..cda873d649 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2948,7 +2948,7 @@ trait TraversalLayerExt { impl TraversalLayerExt for Layer { fn traversal_id(&self) -> TraversalId { - Arc::clone(self.local_path_str()) + Arc::clone(self.debug_str()) } } From 5558457c84c2cb2c948989a2ac4139322dce50e3 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 1 May 2024 12:31:59 -0400 Subject: [PATCH 125/157] chore(pageserver): categorize basebackup errors (#7523) close https://github.com/neondatabase/neon/issues/7391 ## Summary of changes Categorize basebackup error into two types: server error and client error. This makes it easier to set up alerts. --------- Signed-off-by: Alex Chi Z --- pageserver/src/basebackup.rs | 197 ++++++++++++++++++++++++--------- pageserver/src/page_service.rs | 28 ++++- 2 files changed, 166 insertions(+), 59 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 53abd8bfb9..58b18dae7d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,7 +10,7 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{key_to_slru_block, Key}; @@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +#[derive(Debug, thiserror::Error)] +pub enum BasebackupError { + #[error("basebackup pageserver error {0:#}")] + Server(#[from] anyhow::Error), + #[error("basebackup client error {0:#}")] + Client(#[source] io::Error), +} + /// Create basebackup with non-rel data in it. /// Only include relational data if 'full_backup' is true. 
/// @@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>( prev_lsn: Option, full_backup: bool, ctx: &'a RequestContext, -) -> anyhow::Result<()> +) -> Result<(), BasebackupError> where W: AsyncWrite + Send + Sync + Unpin, { @@ -92,8 +100,10 @@ where // Consolidate the derived and the provided prev_lsn values let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { - if backup_prev != Lsn(0) { - ensure!(backup_prev == provided_prev_lsn); + if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { + return Err(BasebackupError::Server(anyhow!( + "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" + ))); } provided_prev_lsn } else { @@ -159,15 +169,26 @@ where } } - async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> { + async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { let (kind, segno, _) = key_to_slru_block(*key)?; match kind { SlruKind::Clog => { - ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8); + if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) { + return Err(BasebackupError::Server(anyhow!( + "invalid SlruKind::Clog record: block.len()={}", + block.len() + ))); + } } SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => { - ensure!(block.len() == BLCKSZ as usize); + if block.len() != BLCKSZ as usize { + return Err(BasebackupError::Server(anyhow!( + "invalid {:?} record: block.len()={}", + kind, + block.len() + ))); + } } } @@ -194,12 +215,15 @@ where Ok(()) } - async fn flush(&mut self) -> anyhow::Result<()> { + async fn flush(&mut self) -> Result<(), BasebackupError> { let nblocks = self.buf.len() / BLCKSZ as usize; let (kind, segno) = self.current_segment.take().unwrap(); let segname = format!("{}/{:>04X}", kind.to_str(), segno); let header = new_tar_header(&segname, self.buf.len() as u64)?; - self.ar.append(&header, self.buf.as_slice()).await?; + self.ar + .append(&header, self.buf.as_slice()) + .await + .map_err(BasebackupError::Client)?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -209,7 +233,7 @@ where Ok(()) } - async fn finish(mut self) -> anyhow::Result<()> { + async fn finish(mut self) -> Result<(), BasebackupError> { let res = if self.current_segment.is_none() || self.buf.is_empty() { Ok(()) } else { @@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W> where W: AsyncWrite + Send + Sync + Unpin, { - async fn send_tarball(mut self) -> anyhow::Result<()> { + async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; @@ -262,7 +286,8 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
.partition( self.timeline.get_shard_identity(), Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, @@ -271,10 +296,15 @@ where let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { - let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?; + let blocks = self + .timeline + .get_vectored(part, self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for (key, block) in blocks { - slru_builder.add_block(&key, block?).await?; + let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + slru_builder.add_block(&key, block).await?; } } slru_builder.finish().await?; @@ -282,8 +312,11 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in - self.timeline.list_dbdirs(self.lsn, self.ctx).await? + for ((spcnode, dbnode), has_relmap_file) in self + .timeline + .list_dbdirs(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -292,7 +325,8 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -315,7 +349,12 @@ where } } - for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? { + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? + { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( @@ -346,34 +385,41 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_twophase_file(xid).await?; } fail_point!("basebackup-before-control-file", |_| { - bail!("failpoint basebackup-before-control-file") + Err(BasebackupError::Server(anyhow!( + "failpoint basebackup-before-control-file" + ))) }); // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file().await?; - self.ar.finish().await?; + self.ar.finish().await.map_err(BasebackupError::Client)?; debug!("all tarred up!"); Ok(()) } /// Add contents of relfilenode `src`, naming it as `dst`. 
- async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; // If the relation is empty, create an empty file if nblocks == 0 { let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; return Ok(()); } @@ -388,13 +434,17 @@ where let img = self .timeline .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); } let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; - self.ar.append(&header, segment_data.as_slice()).await?; + self.ar + .append(&header, segment_data.as_slice()) + .await + .map_err(BasebackupError::Client)?; seg += 1; startblk = endblk; @@ -414,20 +464,22 @@ where spcnode: u32, dbnode: u32, has_relmap_file: bool, - ) -> anyhow::Result<()> { + ) -> Result<(), BasebackupError> { let relmap_img = if has_relmap_file { let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; - ensure!( - img.len() - == dispatch_pgversion!( - self.timeline.pg_version, - pgv::bindings::SIZEOF_RELMAPFILE - ) - ); + if img.len() + != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) + { + return Err(BasebackupError::Server(anyhow!( + "img.len() != SIZE_OF_RELMAPFILE, img.len()={}", + img.len(), + ))); + } Some(img) } else { @@ -440,14 +492,20 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } else { warn!("global/pg_filenode.map is missing"); } @@ -466,18 +524,26 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
.is_empty() { return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == DEFAULTTABLESPACE_OID); + if spcnode != DEFAULTTABLESPACE_OID { + return Err(BasebackupError::Server(anyhow!( + "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}" + ))); + } // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -487,11 +553,17 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } }; Ok(()) @@ -500,11 +572,12 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -512,7 +585,10 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..]).await?; + self.ar + .append(&header, &buf[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } @@ -521,24 +597,28 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { if self.lsn == self.timeline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; + write!(zenith_signal, "PREV LSN: none") + .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid")?; + write!(zenith_signal, "PREV LSN: invalid") + .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + .map_err(|e| BasebackupError::Server(e.into()))?; } self.ar .append( &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, zenith_signal.as_bytes(), ) - .await?; + .await + .map_err(BasebackupError::Client)?; let checkpoint_bytes = self .timeline @@ -560,7 +640,10 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..]).await?; + self.ar + .append(&header, &pg_control_bytes[..]) + .await + .map_err(BasebackupError::Client)?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -575,8 +658,16 @@ where self.lsn, ) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..]).await?; + if wal_seg.len() != WAL_SEGMENT_SIZE { + return Err(BasebackupError::Server(anyhow!( + "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}", + wal_seg.len() + ))); + } + self.ar + .append(&header, &wal_seg[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 96d2397c94..f6b251283c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -48,6 +48,7 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; +use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; @@ -1236,6 +1237,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + fn map_basebackup_error(err: BasebackupError) -> QueryError { + match err { + BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + BasebackupError::Server(e) => QueryError::Other(e), + } + } + let started = std::time::Instant::now(); // check that the timeline exists @@ -1261,7 +1269,8 @@ impl PageServerHandler { let lsn_awaited_after = started.elapsed(); // switch client to COPYOUT - pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; // Send a tarball of the latest layer on the timeline. 
Compress if not @@ -1276,7 +1285,8 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } else { let mut writer = pgb.copyout_writer(); if gzip { @@ -1297,9 +1307,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; // shutdown the encoder to ensure the gzip footer is written - encoder.shutdown().await?; + encoder + .shutdown() + .await + .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -1309,11 +1323,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } } - pgb.write_message_noflush(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; let basebackup_after = started From d43d77389e3d38408ec74d7f30b243d1c181569b Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Wed, 1 May 2024 21:36:50 -0700 Subject: [PATCH 126/157] Add retry loops and bump test timeout in test_pageserver_connection_stress (#7281) --- test_runner/regress/test_bad_connection.py | 23 ++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c808fa0f54..82a3a05c2b 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -1,10 +1,13 @@ import random import time +import psycopg2.errors +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +@pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(".*simulated connection error.*") @@ -20,12 +23,20 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): pg_conn = endpoint.connect() cur = pg_conn.cursor() + def execute_retry_on_timeout(query): + while True: + try: + cur.execute(query) + return + except psycopg2.errors.QueryCanceled: + log.info(f"Query '{query}' timed out - retrying") + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. 
- cur.execute("CREATE TABLE foo (t text)") - cur.execute( + execute_retry_on_timeout("CREATE TABLE foo (t text)") + execute_retry_on_timeout( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -34,7 +45,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): ) # Verify that the table is larger than shared_buffers - cur.execute( + execute_retry_on_timeout( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' @@ -45,16 +56,16 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute("SELECT count(*) FROM foo") + execute_retry_on_timeout("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) end_time = time.time() + 30 times_executed = 0 while time.time() < end_time: if random.random() < 0.5: - cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + execute_retry_on_timeout("INSERT INTO foo VALUES ('stas'), ('heikki')") else: - cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + execute_retry_on_timeout("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") From cb4b4750badbbe02a2b8000f0df3a490cc3664c1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 2 May 2024 10:16:04 +0100 Subject: [PATCH 127/157] update to reqwest 0.12 (#7561) ## Problem #7557 ## Summary of changes --- Cargo.lock | 282 +++++++++++++++++------- Cargo.toml | 13 +- control_plane/src/storage_controller.rs | 4 +- control_plane/storcon_cli/src/main.rs | 5 +- pageserver/src/control_plane_client.rs | 2 +- proxy/Cargo.toml | 5 +- proxy/src/http.rs | 17 +- proxy/src/proxy/wake_compute.rs | 2 +- storage_controller/Cargo.toml | 2 +- storage_controller/src/compute_hook.rs | 36 ++- storage_controller/src/http.rs | 25 ++- storage_controller/src/node.rs | 2 +- storage_controller/src/reconciler.rs | 2 +- storage_controller/src/service.rs | 2 +- workspace_hack/Cargo.toml | 3 +- 15 files changed, 273 insertions(+), 129 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e4bf71c64f..775a0d977d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -595,7 +595,7 @@ dependencies = [ "http 0.2.9", "http-body 0.4.5", "hyper 0.14.26", - "hyper-rustls", + "hyper-rustls 0.24.0", "once_cell", "pin-project-lite", "pin-utils", @@ -684,7 +684,7 @@ dependencies = [ "http-body 0.4.5", "hyper 0.14.26", "itoa", - "matchit", + "matchit 0.7.0", "memchr", "mime", "percent-encoding", @@ -740,7 +740,7 @@ dependencies = [ "pin-project", "quick-xml", "rand 0.8.5", - "reqwest", + "reqwest 0.11.19", "rustc_version", "serde", "serde_json", @@ -865,6 +865,12 @@ version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +[[package]] +name = "base64" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" + [[package]] name = "base64-simd" version = "0.8.0" @@ -1210,7 +1216,7 @@ dependencies = [ "postgres", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rust-ini", "serde", "serde_json", @@ -1329,7 +1335,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "regex", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "serde", @@ -2363,6 +2369,17 
@@ dependencies = [ "winapi", ] +[[package]] +name = "hostname" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba" +dependencies = [ + "cfg-if", + "libc", + "windows 0.52.0", +] + [[package]] name = "http" version = "0.2.9" @@ -2509,6 +2526,7 @@ dependencies = [ "pin-project-lite", "smallvec", "tokio", + "want", ] [[package]] @@ -2526,6 +2544,23 @@ dependencies = [ "tokio-rustls 0.24.0", ] +[[package]] +name = "hyper-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" +dependencies = [ + "futures-util", + "http 1.1.0", + "hyper 1.2.0", + "hyper-util", + "rustls 0.22.4", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.25.0", + "tower-service", +] + [[package]] name = "hyper-timeout" version = "0.4.1" @@ -2573,6 +2608,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" dependencies = [ "bytes", + "futures-channel", "futures-util", "http 1.1.0", "http-body 1.0.0", @@ -2580,6 +2616,9 @@ dependencies = [ "pin-project-lite", "socket2 0.5.5", "tokio", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -2593,7 +2632,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows 0.48.0", ] [[package]] @@ -2916,6 +2955,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" +[[package]] +name = "matchit" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" + [[package]] name = "md-5" version = "0.10.5" @@ -3049,16 +3094,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "mime_guess" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -3402,7 +3437,7 @@ dependencies = [ "bytes", "http 0.2.9", "opentelemetry_api", - "reqwest", + "reqwest 0.11.19", ] [[package]] @@ -3420,7 +3455,7 @@ dependencies = [ "opentelemetry_api", "opentelemetry_sdk", "prost", - "reqwest", + "reqwest 0.11.19", "thiserror", "tokio", "tonic", @@ -3649,7 +3684,7 @@ dependencies = [ "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rpds", "scopeguard", "serde", @@ -3719,7 +3754,7 @@ dependencies = [ "futures", "pageserver_api", "postgres", - "reqwest", + "reqwest 0.12.4", "serde", "thiserror", "tokio", @@ -4328,7 +4363,7 @@ dependencies = [ "hashlink", "hex", "hmac", - "hostname", + "hostname 0.3.1", "http 1.1.0", "http-body-util", "humantime", @@ -4361,7 +4396,7 @@ dependencies = [ "redis", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "reqwest-retry", "reqwest-tracing", @@ -4388,6 +4423,7 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls 0.25.0", "tokio-util", + "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -4703,69 +4739,106 @@ dependencies = [ "http 0.2.9", "http-body 0.4.5", "hyper 0.14.26", - 
"hyper-rustls", "hyper-tls", "ipnet", "js-sys", "log", "mime", - "mime_guess", "native-tls", "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.11", - "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams", + "wasm-streams 0.3.0", "web-sys", - "webpki-roots 0.25.2", - "winreg", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +dependencies = [ + "base64 0.22.0", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.2.0", + "hyper-rustls 0.26.0", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls 0.25.0", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams 0.4.0", + "web-sys", + "webpki-roots 0.26.1", + "winreg 0.52.0", ] [[package]] name = "reqwest-middleware" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" +checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" dependencies = [ "anyhow", "async-trait", - "http 0.2.9", - "reqwest", + "http 1.1.0", + "reqwest 0.12.4", "serde", - "task-local-extensions", "thiserror", + "tower-service", ] [[package]] name = "reqwest-retry" -version = "0.2.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" dependencies = [ "anyhow", "async-trait", "chrono", "futures", "getrandom 0.2.11", - "http 0.2.9", - "hyper 0.14.26", + "http 1.1.0", + "hyper 1.2.0", "parking_lot 0.11.2", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "retry-policies", - "task-local-extensions", "tokio", "tracing", "wasm-timer", @@ -4773,27 +4846,27 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.7" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3" +checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", - "matchit", + "http 1.1.0", + "matchit 0.8.2", "opentelemetry", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", - "task-local-extensions", "tracing", "tracing-opentelemetry", ] [[package]] name = "retry-policies" -version = "0.1.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" dependencies = [ "anyhow", "chrono", @@ -5119,7 +5192,7 @@ dependencies = [ "postgres_ffi", "rand 0.8.5", "remote_storage", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", @@ -5170,7 +5243,7 @@ dependencies = [ "rand 0.8.5", 
"regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "sd-notify", @@ -5300,12 +5373,12 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" +checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", - "reqwest", + "reqwest 0.12.4", "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", @@ -5319,9 +5392,9 @@ dependencies = [ [[package]] name = "sentry-backtrace" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9" +checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e" dependencies = [ "backtrace", "once_cell", @@ -5331,11 +5404,11 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a" +checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a" dependencies = [ - "hostname", + "hostname 0.4.0", "libc", "os_info", "rustc_version", @@ -5345,9 +5418,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055" +checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826" dependencies = [ "once_cell", "rand 0.8.5", @@ -5358,9 +5431,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7" +checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5368,9 +5441,9 @@ dependencies = [ [[package]] name = "sentry-tracing" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3" +checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5380,13 +5453,13 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" +checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c" dependencies = [ "debugid", - "getrandom 0.2.11", "hex", + "rand 0.8.5", "serde", "serde_json", "thiserror", @@ -5778,7 +5851,7 @@ dependencies = [ "pageserver_client", "postgres_connection", "r2d2", - "reqwest", + "reqwest 0.12.4", "routerify", "serde", "serde_json", @@ -5800,7 +5873,7 @@ dependencies = [ "hyper 0.14.26", "pageserver_api", "pageserver_client", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "thiserror", @@ -6500,12 +6573,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" dependencies = [ "once_cell", "opentelemetry", + "opentelemetry_sdk", + "smallvec", "tracing", "tracing-core", "tracing-log", @@ -6551,7 +6626,7 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest", + "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -6637,15 +6712,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] - [[package]] name = "unicode-bidi" version = "0.3.13" @@ -7004,6 +7070,19 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasm-timer" version = "0.2.5" @@ -7044,6 +7123,15 @@ version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.0" @@ -7095,6 +7183,25 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-sys" version = "0.42.0" @@ -7327,6 +7434,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" @@ -7376,7 +7493,8 @@ dependencies = [ "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest", + "reqwest 0.11.19", + "reqwest 0.12.4", "rustls 0.21.11", "scopeguard", "serde", diff --git a/Cargo.toml b/Cargo.toml index 32a0bc23e6..a6d406dc2f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -130,10 +130,10 @@ prost = "0.11" rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } -reqwest-middleware = "0.2.0" -reqwest-retry = "0.2.2" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } +reqwest-middleware = "0.3.0" 
+reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -143,7 +143,7 @@ rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" -sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" @@ -177,9 +177,10 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} +tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.20.0" +tracing-opentelemetry = "0.21.0" tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index b919b14758..f1c43f4036 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -3,7 +3,6 @@ use crate::{ local_env::{LocalEnv, NeonStorageControllerConf}, }; use camino::{Utf8Path, Utf8PathBuf}; -use hyper::Method; use pageserver_api::{ controller_api::{ NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, @@ -17,6 +16,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; +use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; use std::{fs, str::FromStr}; use tokio::process::Command; @@ -379,7 +379,7 @@ impl StorageController { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: reqwest::Method, path: String, body: Option, ) -> anyhow::Result diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b3d1f0be05..c19bc96cdb 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,7 +1,6 @@ use std::{collections::HashMap, str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; -use hyper::{Method, StatusCode}; use pageserver_api::{ controller_api::{ NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, @@ -14,7 +13,7 @@ use pageserver_api::{ shard::{ShardStripeSize, TenantShardId}, }; use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; -use reqwest::Url; +use reqwest::{Method, StatusCode, Url}; use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; @@ -232,7 +231,7 @@ impl Client { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: Method, path: String, body: Option, ) -> mgmt_api::Result diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f0ed46ce23..db0032891e 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -65,7 +65,7 @@ impl ControlPlaneClient { let mut client = reqwest::ClientBuilder::new(); if let Some(jwt) = &conf.control_plane_api_token { - let mut headers = hyper::HeaderMap::new(); + let mut headers = reqwest::header::HeaderMap::new(); headers.insert( "Authorization", format!("Bearer {}", 
jwt.get_contents()).parse().unwrap(), diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 6b8f2ecbf4..0e8d03906b 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -59,8 +59,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } -reqwest = { workspace = true, features = ["json"] } -reqwest-middleware.workspace = true +reqwest.workspace = true +reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true @@ -84,6 +84,7 @@ tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } +tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e20488e23c..fc7400869f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,7 +4,7 @@ pub mod health_server; -use std::{sync::Arc, time::Duration}; +use std::{str::FromStr, sync::Arc, time::Duration}; use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; @@ -103,12 +103,12 @@ impl Endpoint { } } -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use hyper::{ - client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, - service::Service, +use hyper_util::client::legacy::connect::dns::{ + GaiResolver as HyperGaiResolver, Name as HyperName, }; -use reqwest::dns::{Addrs, Resolve, Resolving}; +use reqwest::dns::{Addrs, Name, Resolve, Resolving}; +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use tower_service::Service; #[derive(Debug)] pub struct GaiResolver(HyperGaiResolver); @@ -121,11 +121,12 @@ impl Default for GaiResolver { impl Resolve for GaiResolver { fn resolve(&self, name: Name) -> Resolving { let this = &mut self.0.clone(); + let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); let start = Instant::now(); Box::pin( - Service::::call(this, name.clone()).map(move |result| { + Service::::call(this, hyper_name).map(move |result| { let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); result .map(|addrs| -> Addrs { Box::new(addrs) }) .map_err(|err| -> Box { Box::new(err) }) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index cb9a293413..3d9e94dd72 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -6,7 +6,7 @@ use crate::metrics::{ WakeupFailureKind, }; use crate::proxy::retry::retry_after; -use hyper::StatusCode; +use hyper1::StatusCode; use std::ops::ControlFlow; use tracing::{error, info, warn}; diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 165cafaf4e..789420f2b0 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -31,7 +31,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 44a156a5ec..9d326ef82d 100644 --- 
a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -4,7 +4,7 @@ use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; -use hyper::{Method, StatusCode}; +use hyper::StatusCode; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; @@ -328,7 +328,7 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = self.client.request(Method::PUT, url); + let req = self.client.request(reqwest::Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -347,8 +347,10 @@ impl ComputeHook { }; // Treat all 2xx responses as success - if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES { - if response.status() != StatusCode::OK { + if response.status() >= reqwest::StatusCode::OK + && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES + { + if response.status() != reqwest::StatusCode::OK { // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so // log a warning. tracing::warn!( @@ -362,7 +364,7 @@ impl ComputeHook { // Error response codes match response.status() { - StatusCode::TOO_MANY_REQUESTS => { + reqwest::StatusCode::TOO_MANY_REQUESTS => { // TODO: 429 handling should be global: set some state visible to other requests // so that they will delay before starting, rather than all notifications trying // once before backing off. @@ -371,20 +373,30 @@ impl ComputeHook { .ok(); Err(NotifyError::SlowDown) } - StatusCode::LOCKED => { + reqwest::StatusCode::LOCKED => { // We consider this fatal, because it's possible that the operation blocking the control one is // also the one that is waiting for this reconcile. We should let the reconciler calling // this hook fail, to give control plane a chance to un-lock. 
tracing::info!("Control plane reports tenant is locked, dropping out of notify"); Err(NotifyError::Busy) } - StatusCode::SERVICE_UNAVAILABLE - | StatusCode::GATEWAY_TIMEOUT - | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())), - StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { - Err(NotifyError::Fatal(response.status())) + reqwest::StatusCode::SERVICE_UNAVAILABLE => { + Err(NotifyError::Unavailable(StatusCode::SERVICE_UNAVAILABLE)) } - _ => Err(NotifyError::Unexpected(response.status())), + reqwest::StatusCode::GATEWAY_TIMEOUT => { + Err(NotifyError::Unavailable(StatusCode::GATEWAY_TIMEOUT)) + } + reqwest::StatusCode::BAD_GATEWAY => { + Err(NotifyError::Unavailable(StatusCode::BAD_GATEWAY)) + } + + reqwest::StatusCode::BAD_REQUEST => Err(NotifyError::Fatal(StatusCode::BAD_REQUEST)), + reqwest::StatusCode::UNAUTHORIZED => Err(NotifyError::Fatal(StatusCode::UNAUTHORIZED)), + reqwest::StatusCode::FORBIDDEN => Err(NotifyError::Fatal(StatusCode::FORBIDDEN)), + status => Err(NotifyError::Unexpected( + hyper::StatusCode::from_u16(status.as_u16()) + .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR), + )), } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 09a25a5be0..f9a79afb0d 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -4,6 +4,7 @@ use crate::metrics::{ }; use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; @@ -258,6 +259,12 @@ async fn handle_tenant_time_travel_remote_storage( json_response(StatusCode::OK, ()) } +fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result { + hyper::StatusCode::from_u16(status.as_u16()) + .context("invalid status code") + .map_err(ApiError::InternalServerError) +} + async fn handle_tenant_secondary_download( service: Arc, req: Request, @@ -266,7 +273,7 @@ async fn handle_tenant_secondary_download( let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; - json_response(status, progress) + json_response(map_reqwest_hyper_status(status)?, progress) } async fn handle_tenant_delete( @@ -277,7 +284,10 @@ async fn handle_tenant_delete( check_permissions(&req, Scope::PageServerApi)?; deletion_wrapper(service, move |service| async move { - service.tenant_delete(tenant_id).await + service + .tenant_delete(tenant_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -308,7 +318,10 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; deletion_wrapper(service, move |service| async move { - service.tenant_timeline_delete(tenant_id, timeline_id).await + service + .tenant_timeline_delete(tenant_id, timeline_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -371,11 +384,9 @@ async fn handle_tenant_timeline_passthrough( } // We have a reqest::Response, would like a http::Response - let mut builder = hyper::Response::builder() - .status(resp.status()) - .version(resp.version()); + let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); for (k, v) in resp.headers() { - builder = builder.header(k, v); + builder = builder.header(k.as_str(), v.as_bytes()); } let response = builder diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 
7ba6828deb..7b5513c908 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,6 +1,5 @@ use std::{str::FromStr, time::Duration}; -use hyper::StatusCode; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, @@ -9,6 +8,7 @@ use pageserver_api::{ shard::TenantShardId, }; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index f38905b424..fe97f724c1 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,12 +1,12 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; -use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 186a820adf..f26122e646 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -21,7 +21,6 @@ use control_plane::storage_controller::{ }; use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; -use hyper::StatusCode; use itertools::Itertools; use pageserver_api::{ controller_api::{ @@ -33,6 +32,7 @@ use pageserver_api::{ }, models::{SecondaryProgress, TenantConfigRequest}, }; +use reqwest::StatusCode; use crate::pageserver_client::PageserverClient; use pageserver_api::{ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index d6e2cc2996..a225984688 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -57,7 +57,8 @@ rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } +reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "default-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } From 25af32e8345d04db3ea26617771caae54be767da Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 2 May 2024 11:50:11 +0200 Subject: [PATCH 128/157] proxy: keep track on the number of events from redis by type. (#7582) ## Problem It's unclear what is the distribution of messages, proxy is consuming from redis. ## Summary of changes Add counter. 
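For illustration, the new metric is a fixed-cardinality counter vector with one label variant per Redis message type, bumped once at each consumption site. A condensed sketch of the pattern, taken from the diff below (the surrounding `ProxyMetrics` / `Metrics::get()` plumbing is elided, so this is not standalone code):

```rust
// proxy/src/metrics.rs: one label variant per Redis event type.
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
#[label(singleton = "event")]
pub enum RedisEventsCount {
    EndpointCreated,
    BranchCreated,
    ProjectCreated,
    CancelSession,
    PasswordUpdate,
    AllowedIpsUpdate,
}

// At each consumption site (cache/endpoints.rs, redis/notifications.rs):
Metrics::get()
    .proxy
    .redis_events_count
    .inc(RedisEventsCount::CancelSession);
```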
--- proxy/src/cache/endpoints.rs | 14 +++++++++++++- proxy/src/metrics.rs | 14 ++++++++++++++ proxy/src/redis/notifications.rs | 17 ++++++++++++++++- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 02511e6ff7..4bc10a6020 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -21,7 +21,7 @@ use crate::{ config::EndpointCacheConfig, context::RequestMonitoring, intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, rate_limiter::GlobalRateLimiter, redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, EndpointId, @@ -100,14 +100,26 @@ impl EndpointsCache { if let Some(endpoint_created) = key.endpoint_created { self.endpoints .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); } if let Some(branch_created) = key.branch_created { self.branches .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); } if let Some(project_created) = key.project_created { self.projects .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); } } pub async fn do_read( diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index c129ece059..4a54857012 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -123,6 +123,9 @@ pub struct ProxyMetrics { /// Number of retries (per outcome, per retry_type). #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] pub retries_metric: HistogramVec, + + /// Number of events consumed from redis (per event type). + pub redis_events_count: CounterVec>, } #[derive(MetricGroup)] @@ -530,3 +533,14 @@ pub enum RetryType { WakeCompute, ConnectToCompute, } + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "event")] +pub enum RedisEventsCount { + EndpointCreated, + BranchCreated, + ProjectCreated, + CancelSession, + PasswordUpdate, + AllowedIpsUpdate, +} diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 5a38530faf..ba4dfb755e 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -11,7 +11,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -118,6 +118,10 @@ impl MessageHandler { "session_id", &tracing::field::display(cancel_session.session_id), ); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::CancelSession); if let Some(cancel_region) = cancel_session.region_id { // If the message is not for this region, ignore it. if cancel_region != self.region_id { @@ -138,6 +142,17 @@ impl MessageHandler { } _ => { invalidate_cache(self.cache.clone(), msg.clone()); + if matches!(msg, AllowedIpsUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedIpsUpdate); + } else if matches!(msg, PasswordUpdate { .. 
}) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::PasswordUpdate); + } // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. From 69bf1bae7def8a3f86572f5dd34ab4069614b87b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 2 May 2024 12:52:30 +0100 Subject: [PATCH 129/157] Fix usage of pg_waldump --ignore option (#7578) Previously, the --ignore option was only used when reading from a single file. With this PR pg_waldump -i is enough to open any neon WAL segments --- test_runner/regress/test_pg_waldump.py | 46 ++++++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 ++-- 5 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_pg_waldump.py diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py new file mode 100644 index 0000000000..1973aa5952 --- /dev/null +++ b/test_runner/regress/test_pg_waldump.py @@ -0,0 +1,46 @@ +import os + +from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import subprocess_capture + + +# Simple test to check that pg_waldump works with neon WAL files +def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): + env = neon_simple_env + env.neon_cli.create_branch("test_pg_waldump", "empty") + endpoint = env.endpoints.create_start("test_pg_waldump") + + cur = endpoint.connect().cursor() + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + ROLLBACK; + """ + ) + + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + COMMIT; + """ + ) + + # stop the endpoint to make sure that WAL files are flushed and won't change + endpoint.stop() + + assert endpoint.pgdata_dir + wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") + pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture(test_output_dir, [pg_waldump_path, "--ignore", wal_path]) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d9149dc59a..d6f7e2c604 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d9149dc59abcbeeb26293707509aef51752db28f +Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 85d809c124..f0d6b0ef75 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 85d809c124a898847a97d66a211f7d5ef4f8e0cb +Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 261497dd63..8ef3c33aa0 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 261497dd63ace434045058b1453bcbaaa83f23e5 +Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46 diff --git 
a/vendor/revisions.json b/vendor/revisions.json index dfc0aa04c3..a353fde8fd 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5", - "postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb", - "postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f" + "postgres-v16": "8ef3c33aa01631e17cb24a122776349fcc777b46", + "postgres-v15": "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a", + "postgres-v14": "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a" } From f656db09a4c0bc65fc249fd63c2d5c276f1860fa Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 2 May 2024 09:19:45 -0400 Subject: [PATCH 130/157] fix(pageserver): properly propagate missing key error for vectored get (#7569) Some part of the code requires missing key error to be propagated to the code path correctly (i.e., aux key range scan). Currently, it's an anyhow error. * remove `stuck_lsn` from the missing key error. * as a result, when matching missing key, we do not distinguish the case `stuck_lsn = false/true`. * vectored get now use the unified missing key error. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 55 +++++++++++-------------- test_runner/regress/test_lsn_mapping.py | 5 +-- 2 files changed, 25 insertions(+), 35 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cda873d649..3c0a300a9a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -464,7 +464,6 @@ pub(crate) enum PageReconstructError { #[derive(Debug)] pub struct MissingKeyError { - stuck_at_lsn: bool, key: Key, shard: ShardNumber, cont_lsn: Lsn, @@ -476,23 +475,13 @@ pub struct MissingKeyError { impl std::fmt::Display for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.stuck_at_lsn { - // Records are found in this timeline but no image layer or initial delta record was found. - write!( - f, - "could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}", - self.key, self.shard, self.cont_lsn, self.request_lsn - )?; - if let Some(ref ancestor_lsn) = self.ancestor_lsn { - write!(f, ", ancestor {}", ancestor_lsn)?; - } - } else { - // No records in this timeline. 
- write!( - f, - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - self.key, self.shard, self.cont_lsn, self.request_lsn - )?; + write!( + f, + "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { + write!(f, ", ancestor {}", ancestor_lsn)?; } if !self.traversal_path.is_empty() { @@ -568,8 +557,8 @@ pub(crate) enum GetVectoredError { #[error("Requested at invalid LSN: {0}")] InvalidLsn(Lsn), - #[error("Requested key {0} not found")] - MissingKey(Key), + #[error("Requested key not found: {0}")] + MissingKey(MissingKeyError), #[error(transparent)] GetReadyAncestorError(GetReadyAncestorError), @@ -678,7 +667,7 @@ impl From for PageReconstructError { GetVectoredError::Cancelled => PageReconstructError::Cancelled, GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), - err @ GetVectoredError::MissingKey(_) => PageReconstructError::Other(err.into()), + GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), GetVectoredError::Other(err) => PageReconstructError::Other(err), } @@ -1050,15 +1039,12 @@ impl Timeline { return Err(GetVectoredError::Cancelled) } // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380 - Err(MissingKey(MissingKeyError { - stuck_at_lsn: false, - .. - })) if !NON_INHERITED_RANGE.contains(&key) => { + Err(MissingKey(err)) if !NON_INHERITED_RANGE.contains(&key) => { // The vectored read path handles non inherited keys specially. // If such a a key cannot be reconstructed from the current timeline, // the vectored read path returns a key level error as opposed to a top // level error. - return Err(GetVectoredError::MissingKey(key)); + return Err(GetVectoredError::MissingKey(err)); } Err(Other(err)) if err @@ -1154,7 +1140,7 @@ impl Timeline { match (lhs, rhs) { (Oversized(l), Oversized(r)) => l == r, (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l == r, + (MissingKey(l), MissingKey(r)) => l.key == r.key, (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, (Other(_), Other(_)) => true, _ => false, @@ -3024,7 +3010,6 @@ impl Timeline { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. 
return Err(PageReconstructError::MissingKey(MissingKeyError { - stuck_at_lsn: true, key, shard: self.shard_identity.get_shard_number(&key), cont_lsn: Lsn(cont_lsn.0 - 1), @@ -3039,7 +3024,6 @@ impl Timeline { } ValueReconstructResult::Missing => { return Err(PageReconstructError::MissingKey(MissingKeyError { - stuck_at_lsn: false, key, shard: self.shard_identity.get_shard_number(&key), cont_lsn, @@ -3215,7 +3199,6 @@ impl Timeline { reconstruct_state.on_key_error( key, PageReconstructError::MissingKey(MissingKeyError { - stuck_at_lsn: false, key, shard: self.shard_identity.get_shard_number(&key), cont_lsn, @@ -3248,7 +3231,17 @@ impl Timeline { } if keyspace.total_raw_size() != 0 { - return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); + return Err(GetVectoredError::MissingKey(MissingKeyError { + key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + shard: self + .shard_identity + .get_shard_number(&keyspace.start().unwrap()), + cont_lsn, + request_lsn, + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path: vec![], + backtrace: None, + })); } Ok(()) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 37676ab0d4..5c99ca6733 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -110,10 +110,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): - if neon_env_builder.pageserver_get_impl == "vectored": - key_not_found_error = r".*Requested key.*not found,*" - else: - key_not_found_error = r".*could not find data for key.*" + key_not_found_error = r".*could not find data for key.*" env = neon_env_builder.init_start() From ab95942fc25fa1c6bfd6f3041f16a868e8d86dcf Mon Sep 17 00:00:00 2001 From: Matt Podraza <19386552+mattpodraza@users.noreply.github.com> Date: Thu, 2 May 2024 17:19:51 +0200 Subject: [PATCH 131/157] storage controller: make the initial database wait configurable (#7591) This allows passing a humantime string in the CLI to configure the initial wait for the database. It defaults to the previously hard-coded value of 5 seconds. --- storage_controller/src/main.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index d84803733a..f1454af533 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,7 +5,6 @@ use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::sync::Arc; -use std::time::Duration; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; @@ -70,6 +69,10 @@ struct Cli { /// Maximum number of reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, + + /// How long to wait for the initial database connection to be available. 
+ #[arg(long, default_value = "5s")] + db_connect_timeout: humantime::Duration, } enum StrictMode { @@ -255,7 +258,7 @@ async fn async_main() -> anyhow::Result<()> { }; // After loading secrets & config, but before starting anything else, apply database migrations - Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?; + Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; migration_run(&secrets.database_url) .await From 4b55dad813a2dd23d4e653e656ecdc53068d5ef0 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 2 May 2024 12:43:36 -0400 Subject: [PATCH 132/157] vm-image: add sqlexporter for autoscaling metrics (#7514) As discussed in https://github.com/neondatabase/autoscaling/pull/895, we want to have a separate sql_exporter for simple metrics to avoid overload the database because the autoscaling agent needs to scrape at a higher interval. The new exporter is exposed at port 9499. I didn't do any testing for this pull request but given it's just a configuration change I assume this works. Signed-off-by: Alex Chi Z --- vm-image-spec.yaml | 96 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 061ff38722..3ccdf5cc64 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -17,6 +17,10 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: @@ -88,6 +92,41 @@ files: # Glob patterns are supported (see for syntax). collector_files: - "neon_collector.yml" + - filename: sql_exporter_autoscaling.yml + content: | + # Configuration for sql_exporter for autoscaling-agent + # Global defaults. + global: + # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: 10s + # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: 500ms + # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: 0s + # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + # as will concurrent scrapes. + max_connections: 1 + # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + # always be the same as max_connections. + max_idle_connections: 1 + # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + # If 0, connections are not closed due to a connection's age. + max_connection_lifetime: 5m + + # The target to monitor and the collectors to execute on it. + target: + # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + # the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + + # Collectors (referenced by name) to execute on the target. + # Glob patterns are supported (see for syntax). 
+ collectors: [neon_collector_autoscaling] + + # Collector files specifies a list of globs. One collector definition is read from each matching file. + # Glob patterns are supported (see for syntax). + collector_files: + - "neon_collector_autoscaling.yml" - filename: neon_collector.yml content: | collector_name: neon_collector @@ -194,6 +233,57 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + - filename: neon_collector_autoscaling.yml + content: | + collector_name: neon_collector_autoscaling + metrics: + - metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: [lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + + - metric_name: lfc_used + type: gauge + help: 'LFC chunks used (chunk = 1MB)' + key_labels: + values: [lfc_used] + query: | + select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + + - metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + + - metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; build: | # Build cgroup-tools @@ -267,13 +357,17 @@ merge: | COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml COPY neon_collector.yml /etc/neon_collector.yml + COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml + COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ && chmod 0666 /etc/pgbouncer.ini \ && chmod 0644 /etc/cgconfig.conf \ && chmod 0644 /etc/sql_exporter.yml \ - && chmod 0644 /etc/neon_collector.yml + && chmod 0644 /etc/neon_collector.yml \ + && chmod 0644 /etc/sql_exporter_autoscaling.yml \ + && chmod 0644 /etc/neon_collector_autoscaling.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ From 45ec8688ea27cbad9789aac934a23069cbe95595 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 2 May 2024 18:58:10 +0200 Subject: [PATCH 133/157] chore(pageserver): plumb through RequestContext to VirtualFile write methods (#7566) This PR introduces no functional changes. The read path will be done separately. 
refs https://github.com/neondatabase/neon/issues/6107 refs https://github.com/neondatabase/neon/issues/7386 --- pageserver/src/task_mgr.rs | 2 + pageserver/src/tenant/blob_io.rs | 31 +++++---- pageserver/src/tenant/ephemeral_file.rs | 8 +-- .../src/tenant/ephemeral_file/page_caching.rs | 11 +++- .../ephemeral_file/zero_padded_read_write.rs | 9 ++- .../src/tenant/remote_timeline_client.rs | 3 + .../tenant/remote_timeline_client/download.rs | 10 ++- pageserver/src/tenant/secondary.rs | 8 ++- pageserver/src/tenant/secondary/downloader.rs | 18 +++-- .../src/tenant/storage_layer/delta_layer.rs | 64 +++++++++++++----- .../src/tenant/storage_layer/image_layer.rs | 33 +++++++--- .../tenant/storage_layer/inmemory_layer.rs | 4 +- pageserver/src/tenant/storage_layer/layer.rs | 25 +++++-- pageserver/src/tenant/timeline.rs | 4 +- pageserver/src/tenant/timeline/compaction.rs | 18 +++-- pageserver/src/virtual_file.rs | 32 ++++++--- .../util/size_tracking_writer.rs | 5 +- .../virtual_file/owned_buffers_io/write.rs | 66 ++++++++++++------- 18 files changed, 246 insertions(+), 105 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index b76105399b..0c245580ee 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -363,6 +363,8 @@ pub enum TaskKind { EphemeralFilePreWarmPageCache, + LayerDownload, + #[cfg(test)] UnitTest, } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 6e90b3e8ff..1dc451f5c9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -130,8 +130,9 @@ impl BlobWriter { async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { - let (src_buf, res) = self.inner.write_all(src_buf).await; + let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, Err(e) => return (src_buf, Err(e)), @@ -142,9 +143,9 @@ impl BlobWriter { #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. 
- pub async fn flush_buffer(&mut self) -> Result<(), Error> { + pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf).await; + let (mut buf, res) = self.inner.write_all(buf, ctx).await; res?; buf.clear(); self.buf = buf; @@ -165,10 +166,11 @@ impl BlobWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf).await; + return self.write_all_unbuffered(src_buf, ctx).await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); @@ -183,7 +185,7 @@ impl BlobWriter { } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - if let Err(e) = self.flush_buffer().await { + if let Err(e) = self.flush_buffer(ctx).await { return (Slice::into_inner(src_buf), Err(e)); } } @@ -199,7 +201,7 @@ impl BlobWriter { assert_eq!(copied, src_buf.len()); Slice::into_inner(src_buf) } else { - let (src_buf, res) = self.write_all_unbuffered(src_buf).await; + let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; if let Err(e) = res { return (src_buf, Err(e)); } @@ -216,6 +218,7 @@ impl BlobWriter { pub async fn write_blob, Buf: IoBuf + Send>( &mut self, srcbuf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let offset = self.offset; @@ -227,7 +230,7 @@ impl BlobWriter { if len < 128 { // Short blob. Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } else { // Write a 4-byte length header if len > 0x7fff_ffff { @@ -242,7 +245,7 @@ impl BlobWriter { let mut len_buf = (len as u32).to_be_bytes(); len_buf[0] |= 0x80; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } } .await; @@ -251,7 +254,7 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf).await; + let (srcbuf, res) = self.write_all(srcbuf, ctx).await; (srcbuf, res.map(|_| offset)) } } @@ -261,8 +264,8 @@ impl BlobWriter { /// /// This function flushes the internal buffer before giving access /// to the underlying `VirtualFile`. 
- pub async fn into_inner(mut self) -> Result { - self.flush_buffer().await?; + pub async fn into_inner(mut self, ctx: &RequestContext) -> Result { + self.flush_buffer(ctx).await?; Ok(self.inner) } @@ -299,16 +302,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path()).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone()).await; + let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer().await?; + wtr.flush_buffer(&ctx).await?; } let file = VirtualFile::open(pathbuf.as_path()).await?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 96efd13c1b..8b815a1885 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -74,7 +74,7 @@ impl EphemeralFile { pub(crate) async fn write_blob( &mut self, srcbuf: &[u8], - _ctx: &RequestContext, + ctx: &RequestContext, ) -> Result { let pos = self.rw.bytes_written(); @@ -83,15 +83,15 @@ impl EphemeralFile { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - self.rw.write_all_borrowed(&len_buf).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - self.rw.write_all_borrowed(&len_buf).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } // Write the payload - self.rw.write_all_borrowed(srcbuf).await?; + self.rw.write_all_borrowed(srcbuf, ctx).await?; Ok(pos) } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 934400e5be..42def8858e 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -35,10 +35,14 @@ impl RW { self.page_cache_file_id } - pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result { + pub(crate) async fn write_all_borrowed( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { // It doesn't make sense to proactively fill the page cache on the Pageserver write path // because Compute is unlikely to access recently written data. - self.rw.write_all_borrowed(srcbuf).await + self.rw.write_all_borrowed(srcbuf, ctx).await } pub(crate) fn bytes_written(&self) -> u64 { @@ -134,6 +138,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi >( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let buf = buf.slice(..); let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done @@ -150,7 +155,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi ); // Do the IO. 
- let iobuf = match self.file.write_all(buf).await { + let iobuf = match self.file.write_all(buf, ctx).await { (iobuf, Ok(nwritten)) => { assert_eq!(nwritten, buflen); iobuf diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index 4159b5820a..b37eafb52c 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -20,6 +20,7 @@ mod zero_padded; use crate::{ + context::RequestContext, page_cache::PAGE_SZ, virtual_file::owned_buffers_io::{ self, @@ -60,8 +61,12 @@ where self.buffered_writer.as_inner().as_inner() } - pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result { - self.buffered_writer.write_buffered_borrowed(buf).await + pub async fn write_all_borrowed( + &mut self, + buf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf, ctx).await } pub fn bytes_written(&self) -> u64 { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c0767345ca..a54e93c96b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -210,6 +210,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; +use crate::context::RequestContext; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, @@ -505,6 +506,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -522,6 +524,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, cancel, + ctx, ) .measure_remote_op( RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 250354ac20..345a12aa86 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -18,6 +18,7 @@ use tracing::warn; use utils::backoff; use crate::config::PageServerConf; +use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; @@ -40,6 +41,7 @@ use super::{ /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. 
+#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, @@ -48,6 +50,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -75,7 +78,7 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, + || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, &format!("download {remote_path:?}"), cancel, ) @@ -133,6 +136,7 @@ async fn download_object<'a>( src_path: &RemotePath, dst_path: &Utf8PathBuf, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { let res = match crate::virtual_file::io_engine::get() { crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), @@ -208,10 +212,10 @@ async fn download_object<'a>( Err(e) => return Err(e), }; buffered - .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) .await?; } - let size_tracking = buffered.flush_and_into_inner().await?; + let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) } .await?; diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 19f36c722e..5c46df268a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -7,6 +7,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, virtual_file::MaybeFatalIo, @@ -316,9 +317,13 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + let downloader_task_ctx = RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryDownloads, + downloader_task_ctx.task_kind(), None, None, "secondary tenant downloads", @@ -330,6 +335,7 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, + downloader_task_ctx, ) .await; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 67f866cb7b..8a987b5ade 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -8,6 +8,7 @@ use std::{ use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, @@ -74,12 +75,14 @@ pub(super) async fn downloader_task( command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, + root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, + root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); @@ -92,6 +95,7 @@ pub(super) async fn downloader_task( struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, + root_ctx: RequestContext, } #[derive(Debug, Clone)] @@ -367,11 
+371,12 @@ impl JobGenerator { @@ -485,7 +490,7 @@ impl<'a> TenantDownloader<'a> { } } - async fn download(&self) -> Result<(), UpdateError> { + async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure @@ -560,7 +565,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline) + self.download_timeline(timeline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -742,7 +747,11 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_path = self @@ -875,6 +884,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &LayerFileMetadata::from(&layer.metadata), &self.secondary_state.cancel, + ctx, ) .await { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index a9f8404158..b5538dff3a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -428,9 +428,15 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. /// - async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init()) + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) .await; res } @@ -441,9 +447,10 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val).await; + let (val, res) = self.blob_writer.write_blob(val, ctx).await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), @@ -463,18 +470,23 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. 
/// - async fn finish(self, key_end: Key, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let mut file = self.blob_writer.into_inner().await?; + let mut file = self.blob_writer.into_inner(ctx).await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -494,7 +506,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -592,8 +604,18 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. /// - pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val).await + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value(key, lsn, val, ctx) + .await } pub async fn put_value_bytes( @@ -602,11 +624,12 @@ impl DeltaLayerWriter { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() - .put_value_bytes(key, lsn, val, will_init) + .put_value_bytes(key, lsn, val, will_init, ctx) .await } @@ -621,10 +644,11 @@ impl DeltaLayerWriter { mut self, key_end: Key, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let inner = self.inner.take().unwrap(); let temp_path = inner.path.clone(); - let result = inner.finish(key_end, timeline).await; + let result = inner.finish(key_end, timeline, ctx).await; // The delta layer files can sometimes be really large. Clean them up. 
if result.is_err() { tracing::warn!( @@ -692,7 +716,7 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -1281,7 +1305,13 @@ impl DeltaLayerInner { per_blob_copy.extend_from_slice(data); let (tmp, res) = writer - .put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init) + .put_value_bytes( + key, + lsn, + std::mem::take(&mut per_blob_copy), + will_init, + ctx, + ) .await; per_blob_copy = tmp; res?; @@ -1760,12 +1790,14 @@ mod test { for entry in entries { let (_, res) = writer - .put_value_bytes(entry.key, entry.lsn, entry.value, false) + .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) .await; res?; } - let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; + let resident = writer + .finish(entries_meta.key_range.end, &timeline, &ctx) + .await?; let inner = resident.as_delta(&ctx).await?; @@ -1951,7 +1983,7 @@ mod test { .await .unwrap(); - let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap(); + let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); copied_layer.as_delta(ctx).await.unwrap(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 6f46a0203b..1477a1fc33 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -357,7 +357,7 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -677,9 +677,14 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. /// - async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { + async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img).await; + let (_img, res) = self.blob_writer.write_blob(img, ctx).await; // TODO: re-use the buffer for `img` further upstack let off = res?; @@ -693,7 +698,11 @@ impl ImageLayerWriterInner { /// /// Finish writing the image layer. /// - async fn finish(self, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -704,7 +713,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } @@ -724,7 +733,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -806,8 +815,13 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. 
/// - pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img).await + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img, ctx).await } /// @@ -816,8 +830,9 @@ impl ImageLayerWriter { pub(crate) async fn finish( mut self, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline).await + self.inner.take().unwrap().finish(timeline, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index a2ae8ec29d..4dacbec2f3 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -659,14 +659,14 @@ impl InMemoryLayer { let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init) + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) .await; res?; } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2b6934fcee..ebc0cbf9a4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -14,9 +14,10 @@ use utils::lsn::Lsn; use utils::sync::heavier_once_cell; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; @@ -939,11 +940,20 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } + let download_ctx = ctx + .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) + .unwrap_or(RequestContext::new( + TaskKind::LayerDownload, + DownloadBehavior::Download, + )); + async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - let res = self.download_init_and_wait(timeline, permit).await?; + let res = self + .download_init_and_wait(timeline, permit, download_ctx) + .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } @@ -982,6 +992,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: RequestContext, ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1011,7 +1022,7 @@ impl LayerInner { .await .unwrap(); - let res = this.download_and_init(timeline, permit).await; + let res = this.download_and_init(timeline, permit, &ctx).await; if let Err(res) = tx.send(res) { match res { @@ -1054,6 +1065,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: &RequestContext, ) -> anyhow::Result> { let client = timeline .remote_client @@ -1061,7 +1073,12 @@ impl LayerInner { .expect("checked before download_init_and_wait"); let result = client - .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel) + 
.download_layer_file( + &self.desc.filename(), + &self.metadata(), + &timeline.cancel, + ctx, + ) .await; match result { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3c0a300a9a..22bfa53445 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4179,7 +4179,7 @@ impl Timeline { }; // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img).await?; + image_layer_writer.put_image(img_key, img, ctx).await?; wrote_keys = true; } } @@ -4190,7 +4190,7 @@ impl Timeline { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. start = img_range.end; - let image_layer = image_layer_writer.finish(self).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; image_layers.push(image_layer); } else { // Special case: the image layer may be empty if this is a sharded tenant and the diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6ea37bf793..1088101a13 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -520,7 +520,7 @@ impl Timeline { writer .take() .unwrap() - .finish(prev_key.unwrap().next(), self) + .finish(prev_key.unwrap().next(), self, ctx) .await?, ); writer = None; @@ -562,7 +562,11 @@ impl Timeline { ); } - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + writer + .as_mut() + .unwrap() + .put_value(key, lsn, value, ctx) + .await?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -578,7 +582,7 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); } // Sync layers @@ -972,7 +976,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let value = val.load(ctx).await?; - writer.put_value(key, lsn, value).await?; + writer.put_value(key, lsn, value, ctx).await?; prev = Some((key, lsn)); } @@ -988,7 +992,7 @@ impl CompactionJobExecutor for TimelineAdaptor { }); let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline) + .finish(prev.unwrap().0.next(), &self.timeline, ctx) .await?; self.new_deltas.push(new_delta_layer); @@ -1058,11 +1062,11 @@ impl TimelineAdaptor { } } }; - image_layer_writer.put_image(key, img).await?; + image_layer_writer.put_image(key, img, ctx).await?; key = key.next(); } } - let image_layer = image_layer_writer.finish(&self.timeline).await?; + let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?; self.new_images.push(image_layer); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 6127b35079..a17488a286 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,6 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! 
+use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; use crate::page_cache::PageWriteGuard; @@ -615,6 +616,7 @@ impl VirtualFile { &self, buf: B, mut offset: u64, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { let buf_len = buf.bytes_init(); if buf_len == 0 { @@ -623,7 +625,7 @@ impl VirtualFile { let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { let res; - (buf, res) = self.write_at(buf, offset).await; + (buf, res) = self.write_at(buf, offset, ctx).await; match res { Ok(0) => { return ( @@ -652,6 +654,7 @@ impl VirtualFile { pub async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -660,7 +663,7 @@ impl VirtualFile { let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { let res; - (buf, res) = self.write(buf).await; + (buf, res) = self.write(buf, ctx).await; match res { Ok(0) => { return ( @@ -684,9 +687,10 @@ impl VirtualFile { async fn write( &mut self, buf: Slice, + ctx: &RequestContext, ) -> (Slice, Result) { let pos = self.pos; - let (buf, res) = self.write_at(buf, pos).await; + let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { Ok(n) => n, Err(e) => return (buf, Err(e)), @@ -724,6 +728,7 @@ impl VirtualFile { &self, buf: Slice, offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ ) -> (Slice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -1088,8 +1093,9 @@ impl OwnedAsyncWriter for VirtualFile { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (buf, res) = VirtualFile::write_all(self, buf).await; + let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; res.map(move |v| (v, buf)) } } @@ -1146,6 +1152,9 @@ fn get_open_files() -> &'static OpenFiles { #[cfg(test)] mod tests { + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use super::*; use rand::seq::SliceRandom; use rand::thread_rng; @@ -1177,10 +1186,11 @@ mod tests { &self, buf: B, offset: u64, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset).await; + let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } MaybeVirtualFile::File(file) => { @@ -1201,10 +1211,11 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } MaybeVirtualFile::File(file) => { @@ -1275,6 +1286,7 @@ mod tests { OF: Fn(Utf8PathBuf, OpenOptions) -> FT, FT: Future>, { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; @@ -1288,7 +1300,7 @@ mod tests { .to_owned(), ) .await?; - file_a.write_all(b"foobar".to_vec()).await?; + file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string().await.unwrap_err(); @@ -1297,7 +1309,7 @@ mod tests { let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - let _ = 
file_a.write_all(b"bar".to_vec()).await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string().await?); @@ -1339,8 +1351,8 @@ mod tests { .to_owned(), ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3).await?; - file_b.write_all_at(b"FOO".to_vec(), 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; + file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index c2817699c3..55b1d0b46b 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,4 +1,4 @@ -use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter; +use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; use tokio_epoll_uring::{BoundedBuf, IoBuf}; pub struct Writer { @@ -38,8 +38,9 @@ where async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (nwritten, buf) = self.dst.write_all(buf).await?; + let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 738a642332..ac5169508f 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,12 +1,15 @@ use bytes::BytesMut; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use crate::context::RequestContext; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. 
pub trait OwnedAsyncWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)>; } @@ -57,8 +60,9 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self) -> std::io::Result { - self.flush().await?; + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + self.flush(ctx).await?; + let Self { buf, writer } = self; assert!(buf.is_some()); Ok(writer) @@ -72,14 +76,18 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<(usize, S)> + pub async fn write_buffered( + &mut self, + chunk: Slice, + ctx: &RequestContext, + ) -> std::io::Result<(usize, S)> where S: IoBuf + Send, { let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { - self.flush().await?; + self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( self.buf @@ -88,7 +96,7 @@ where .pending(), 0 ); - let (nwritten, chunk) = self.writer.write_all(chunk).await?; + let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; assert_eq!(nwritten, chunk_len); return Ok((nwritten, chunk)); } @@ -104,7 +112,7 @@ where slice = &slice[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush().await?; + self.flush(ctx).await?; } } assert!(slice.is_empty(), "by now we should have drained the chunk"); @@ -116,7 +124,11 @@ where /// It is less performant because we always have to copy the borrowed data into the internal buffer /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant /// for large writes. - pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result { + pub async fn write_buffered_borrowed( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { let chunk_len = chunk.len(); while !chunk.is_empty() { let buf = self.buf.as_mut().expect("must not use after an error"); @@ -127,20 +139,20 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush().await?; + self.flush(ctx).await?; } } Ok(chunk_len) } - async fn flush(&mut self) -> std::io::Result<()> { + async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { self.buf = Some(buf); return Ok(()); } - let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?; + let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; assert_eq!(nwritten, buf_len); self.buf = Some(Buffer::reuse_after_flush(io_buf)); Ok(()) @@ -206,6 +218,7 @@ impl OwnedAsyncWriter for Vec { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -222,6 +235,8 @@ mod tests { use bytes::BytesMut; use super::*; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; #[derive(Default)] struct RecorderWriter { @@ -231,6 +246,7 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -243,10 +259,14 @@ mod tests { } } + fn test_ctx() -> RequestContext { + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + 
} + macro_rules! write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) .await?; }}; } @@ -260,7 +280,7 @@ mod tests { write!(writer, b"c"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] @@ -276,7 +296,7 @@ mod tests { write!(writer, b"de"); write!(writer, b""); write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] @@ -292,7 +312,7 @@ mod tests { write!(writer, b"bc"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] @@ -302,18 +322,20 @@ mod tests { #[tokio::test] async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let ctx = test_ctx(); + let ctx = &ctx; let recorder = RecorderWriter::default(); let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - writer.write_buffered_borrowed(b"abc").await?; - writer.write_buffered_borrowed(b"d").await?; - writer.write_buffered_borrowed(b"e").await?; - writer.write_buffered_borrowed(b"fg").await?; - writer.write_buffered_borrowed(b"hi").await?; - writer.write_buffered_borrowed(b"j").await?; - writer.write_buffered_borrowed(b"klmno").await?; + writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"d", ctx).await?; + writer.write_buffered_borrowed(b"e", ctx).await?; + writer.write_buffered_borrowed(b"fg", ctx).await?; + writer.write_buffered_borrowed(b"hi", ctx).await?; + writer.write_buffered_borrowed(b"j", ctx).await?; + writer.write_buffered_borrowed(b"klmno", ctx).await?; - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(ctx).await?; assert_eq!( recorder.writes, { From 7a49e5d5c21aeefcba4aa0a1135069fa6a4e8de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 2 May 2024 20:18:13 +0200 Subject: [PATCH 134/157] Remove tenant_id from TenantLocationConfigRequest (#7469) Follow-up of #7055 and #7476 to remove `tenant_id` from `TenantLocationConfigRequest` completely. All components of our system should now not specify the `tenant_id`. 
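As an illustration of the resulting wire format, here is a minimal, hedged sketch (the `LocationConfig` fields are simplified stand-ins, not the real type): because the remaining `config` field is `#[serde(flatten)]`-ed, the request body is exactly the location config object, and the tenant id now travels only in the URL path.

```rust
// Hedged sketch: simplified stand-ins, not the real pageserver_api types.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug)]
struct LocationConfig {
    mode: String,            // e.g. "AttachedSingle"
    generation: Option<u32>, // illustrative subset of the real fields
}

#[derive(Serialize, Deserialize, Debug)]
struct TenantLocationConfigRequest {
    #[serde(flatten)]
    config: LocationConfig, // no tenant_id field any more
}

fn main() -> Result<(), serde_json::Error> {
    let req = TenantLocationConfigRequest {
        config: LocationConfig {
            mode: "AttachedSingle".to_string(),
            generation: Some(1),
        },
    };
    // Prints {"mode":"AttachedSingle","generation":1}; note there is no "tenant_id" key.
    println!("{}", serde_json::to_string(&req)?);
    Ok(())
}
```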
cc https://github.com/neondatabase/cloud/pull/11791 --- Cargo.lock | 21 +++++++++++---------- libs/pageserver_api/src/models.rs | 2 -- pageserver/client/src/mgmt_api.rs | 5 +---- pageserver/src/http/openapi_spec.yml | 3 --- test_runner/fixtures/pageserver/http.py | 1 - 5 files changed, 12 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 775a0d977d..1db41cd755 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -867,9 +867,9 @@ checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" [[package]] name = "base64" -version = "0.22.0" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "base64-simd" @@ -4769,7 +4769,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" dependencies = [ - "base64 0.22.0", + "base64 0.22.1", "bytes", "futures-channel", "futures-core", @@ -5927,7 +5927,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10" +source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8" [[package]] name = "syn" @@ -6508,10 +6508,11 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ + "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -6531,9 +6532,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", @@ -6542,9 +6543,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", "valuable", diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index c752799c4c..a54cdb520d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -430,8 +430,6 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - #[serde(skip_serializing_if = "Option::is_none")] - pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 012cb1a662..bc66c5c6e1 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -291,10 +291,7 @@ impl Client { flush_ms: Option, lazy: bool, ) -> 
Result<()> { - let req_body = TenantLocationConfigRequest { - tenant_id: None, - config, - }; + let req_body = TenantLocationConfigRequest { config }; let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index d89f949688..c425f3e628 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -782,9 +782,6 @@ components: required: - mode properties: - tenant_id: - type: string - description: Not used, scheduled for removal. mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index b899b0dac8..231ffd898e 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -293,7 +293,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): lazy: Optional[bool] = None, ): body = location_conf.copy() - body["tenant_id"] = str(tenant_id) params = {} if flush_ms is not None: From 5f099dc7603d0b41418ad9b5e7267e377f24534c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 2 May 2024 20:19:00 +0200 Subject: [PATCH 135/157] Use streaming downloads for Azure as well (#7579) The main challenge was in the second commit, as `DownloadStream` requires the inner to be Sync but the stream returned by the Azure SDK wasn't Sync. This left us with three options: * Change the Azure SDK to return Sync streams. This was abandoned after we realized that we couldn't just make `TokenCredential`'s returned future Sync: it uses the `async_trait` macro and as the `TokenCredential` trait is used in dyn form, one can't use Rust's new "async fn in Trait" feature. * Change `DownloadStream` to not require `Sync`. This was abandoned after it turned into a safekeeper refactoring project. * Put the stream into a `Mutex` and make it obtain a lock on every poll. This adds some performance overhead but locks that actually don't do anything should be comparatively cheap. We went with the third option in the end as the change still represents an improvement. 
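To make the third option concrete, here is a self-contained, hedged sketch (not code from this patch) of why an exclusive wrapper is enough: `sync_wrapper::SyncStream` (enabled via the `futures` feature added to `Cargo.toml`) is `Sync` regardless of what it wraps, because the inner stream can only ever be polled through `&mut self`, so a shared reference can never touch it.

```rust
// Hedged sketch: the Cell-capturing stream stands in for the non-Sync Azure SDK stream.
use std::cell::Cell;

use futures_util::stream::{self, Stream, StreamExt};
use sync_wrapper::SyncStream;

// Stand-in for the `Sync` bound that the download-stream plumbing requires.
fn requires_sync_stream<S: Stream<Item = u8> + Sync>(_stream: S) {}

fn main() {
    // A closure capturing a `Cell` makes this stream `!Sync`.
    let counter = Cell::new(0u8);
    let not_sync = stream::iter([1u8, 2, 3]).map(move |x| {
        counter.set(counter.get() + 1);
        x
    });

    // requires_sync_stream(not_sync); // would not compile: the stream is !Sync
    requires_sync_stream(SyncStream::new(not_sync)); // compiles: SyncStream is always Sync
}
```

Whether done with a `Mutex` or with `SyncStream`, the extra cost is paid only once per poll, which, as argued above, is comparatively cheap.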
Follow up of #5446 , fixes #5563 --- Cargo.lock | 5 ++ libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/azure_blob.rs | 66 ++++++++++++++++----------- workspace_hack/Cargo.toml | 1 + 4 files changed, 46 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1db41cd755..438b68493b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4714,6 +4714,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sync_wrapper", "test-context", "tokio", "tokio-stream", @@ -5956,6 +5957,9 @@ name = "sync_wrapper" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -7505,6 +7509,7 @@ dependencies = [ "subtle", "syn 1.0.109", "syn 2.0.52", + "sync_wrapper", "time", "time-macros", "tokio", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4a53f485ca..78da01c9a0 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -38,6 +38,7 @@ azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true itertools.workspace = true +sync_wrapper = { workspace = true, features = ["futures"] } [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 5fff3e25c9..24c1248304 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::env; +use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; @@ -20,6 +21,7 @@ use azure_storage_blobs::blob::CopyStatus; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use bytes::Bytes; +use futures::future::Either; use futures::stream::Stream; use futures_util::StreamExt; use futures_util::TryStreamExt; @@ -128,12 +130,12 @@ impl AzureBlobStorage { let kind = RequestKind::Get; let _permit = self.permit(kind, cancel).await?; + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let mut etag = None; let mut last_modified = None; let mut metadata = HashMap::new(); - // TODO give proper streaming response instead of buffering into RAM - // https://github.com/neondatabase/neon/issues/5563 let download = async { let response = builder @@ -152,39 +154,46 @@ impl AzureBlobStorage { Err(_elapsed) => Err(DownloadError::Timeout), }); - let mut response = std::pin::pin!(response); + let mut response = Box::pin(response); - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part?; - if etag.is_none() { - etag = Some(part.blob.properties.etag); - } - if last_modified.is_none() { - last_modified = Some(part.blob.properties.last_modified.into()); - } - if let Some(blob_meta) = part.blob.metadata { - metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); - } - let data = part - .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); - } - - if bufs.is_empty() { + let Some(part) = response.next().await else { return Err(DownloadError::Other(anyhow::anyhow!( - "Azure GET response contained no buffers" + "Azure GET response contained no response body" ))); + }; + let part = 
part?; + if etag.is_none() { + etag = Some(part.blob.properties.etag); } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } + if let Some(blob_meta) = part.blob.metadata { + metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); + } + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already let etag = etag.unwrap(); let last_modified = last_modified.unwrap(); + let tail_stream = response + .map(|part| match part { + Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))), + Err(e) => { + Either::Right(futures::stream::once(async { Err(io::Error::other(e)) })) + } + }) + .flatten(); + let stream = part + .data + .map(|r| r.map_err(io::Error::other)) + .chain(sync_wrapper::SyncStream::new(tail_stream)); + //.chain(SyncStream::from_pin(Box::pin(tail_stream))); + + let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream); + Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + download_stream: Box::pin(download_stream), etag, last_modified, metadata: Some(StorageMetadata(metadata)), @@ -193,7 +202,10 @@ impl AzureBlobStorage { tokio::select! { bufs = download => bufs, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), + cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { + TimeoutOrCancel::Timeout => Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled), + }, } } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a225984688..b2da33e44a 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -66,6 +66,7 @@ serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } +sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } From 240efb82f918166a4b596c698f701f14a76d18f8 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Fri, 3 May 2024 10:00:29 +0200 Subject: [PATCH 136/157] Proxy reconnect pubsub before expiration (#7562) ## Problem Proxy reconnects to redis only after it's already unavailable. ## Summary of changes Reconnects every 6h. 
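The handover added here boils down to the pattern below, shown as a self-contained, hedged sketch (illustrative names, same 6 h lifetime and 1 minute overlap; not an excerpt from the proxy code): a fresh listener is spawned one minute before each 6 h boundary, and the previous one is cancelled only after that overlap, so no invalidation message is dropped while reconnecting.

```rust
// Hedged sketch of the rotation pattern; `listen` stands in for handle_messages().
use std::time::Duration;

use tokio_util::sync::CancellationToken;

const LISTEN_FOR: Duration = Duration::from_secs(6 * 60 * 60); // each listener lives ~6h
const ROTATE_EVERY: Duration = Duration::from_secs(6 * 60 * 60 - 60); // start the next one 1 min early

async fn listen(label: usize, cancel: CancellationToken) {
    // Placeholder: poll the pubsub connection until told to stop.
    cancel.cancelled().await;
    println!("listener {label} stopped");
}

#[tokio::main]
async fn main() {
    let mut interval = tokio::time::interval(ROTATE_EVERY);
    for label in 0usize.. {
        interval.tick().await; // the first tick fires immediately
        let cancel = CancellationToken::new();
        tokio::spawn(listen(label, cancel.clone()));
        // Retire this listener after its 6h lifetime, i.e. one minute after its successor started.
        tokio::spawn(async move {
            tokio::time::sleep(LISTEN_FOR).await;
            cancel.cancel();
        });
    }
}
```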
--- proxy/src/cache/project_info.rs | 42 ++++++++++----- proxy/src/redis/notifications.rs | 93 ++++++++++++++++++++++---------- 2 files changed, 95 insertions(+), 40 deletions(-) diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index d8a1d261ce..10cc4ceee1 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,9 +5,11 @@ use std::{ time::Duration, }; +use async_trait::async_trait; use dashmap::DashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; +use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; @@ -21,11 +23,12 @@ use crate::{ use super::{Cache, Cached}; +#[async_trait] pub trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); - fn enable_ttl(&self); - fn disable_ttl(&self); + async fn decrement_active_listeners(&self); + async fn increment_active_listeners(&self); } struct Entry { @@ -116,8 +119,10 @@ pub struct ProjectInfoCacheImpl { start_time: Instant, ttl_disabled_since_us: AtomicU64, + active_listeners_lock: Mutex, } +#[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); @@ -148,15 +153,27 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn enable_ttl(&self) { - self.ttl_disabled_since_us - .store(u64::MAX, std::sync::atomic::Ordering::Relaxed); + async fn decrement_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + if *listeners_guard == 0 { + tracing::error!("active_listeners count is already 0, something is broken"); + return; + } + *listeners_guard -= 1; + if *listeners_guard == 0 { + self.ttl_disabled_since_us + .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); + } } - fn disable_ttl(&self) { - let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; - self.ttl_disabled_since_us - .store(new_ttl, std::sync::atomic::Ordering::Relaxed); + async fn increment_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + *listeners_guard += 1; + if *listeners_guard == 1 { + let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; + self.ttl_disabled_since_us + .store(new_ttl, std::sync::atomic::Ordering::SeqCst); + } } } @@ -168,6 +185,7 @@ impl ProjectInfoCacheImpl { config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), + active_listeners_lock: Mutex::new(0), } } @@ -432,7 +450,7 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), })); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_secs(2)).await; let project_id: ProjectId = "project".into(); @@ -489,7 +507,7 @@ mod tests { } #[tokio::test] - async fn test_disable_ttl_invalidate_added_before() { + async fn test_increment_active_listeners_invalidate_added_before() { tokio::time::pause(); let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { size: 2, @@ -514,7 +532,7 @@ mod tests { (&user1).into(), secret1.clone(), ); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_millis(100)).await; cache.insert_role_secret( (&project_id).into(), diff --git a/proxy/src/redis/notifications.rs 
b/proxy/src/redis/notifications.rs index ba4dfb755e..87d723d17e 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -4,6 +4,7 @@ use futures::StreamExt; use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; @@ -77,6 +78,16 @@ struct MessageHandler { region_id: String, } +impl Clone for MessageHandler { + fn clone(&self) -> Self { + Self { + cache: self.cache.clone(), + cancellation_handler: self.cancellation_handler.clone(), + region_id: self.region_id.clone(), + } + } +} + impl MessageHandler { pub fn new( cache: Arc, @@ -89,11 +100,11 @@ impl MessageHandler { region_id, } } - pub fn disable_ttl(&self) { - self.cache.disable_ttl(); + pub async fn increment_active_listeners(&self) { + self.cache.increment_active_listeners().await; } - pub fn enable_ttl(&self) { - self.cache.enable_ttl(); + pub async fn decrement_active_listeners(&self) { + self.cache.decrement_active_listeners().await; } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { @@ -182,37 +193,24 @@ fn invalidate_cache(cache: Arc, msg: Notification) { } } -/// Handle console's invalidation messages. -#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main( +async fn handle_messages( + handler: MessageHandler, redis: ConnectionWithCredentialsProvider, - cache: Arc, - cancel_map: CancelMap, - region_id: String, -) -> anyhow::Result -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - cache.enable_ttl(); - let handler = MessageHandler::new( - cache, - Arc::new(CancellationHandler::<()>::new( - cancel_map, - crate::metrics::CancellationSource::FromRedis, - )), - region_id, - ); - + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { loop { + if cancellation_token.is_cancelled() { + return Ok(()); + } let mut conn = match try_connect(&redis).await { Ok(conn) => { - handler.disable_ttl(); + handler.increment_active_listeners().await; conn } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } @@ -226,8 +224,47 @@ where break; } } + if cancellation_token.is_cancelled() { + handler.decrement_active_listeners().await; + return Ok(()); + } } - handler.enable_ttl(); + handler.decrement_active_listeners().await; + } +} + +/// Handle console's invalidation messages. +#[tracing::instrument(name = "redis_notifications", skip_all)] +pub async fn task_main( + redis: ConnectionWithCredentialsProvider, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + cancel_map, + crate::metrics::CancellationSource::FromRedis, + )); + let handler = MessageHandler::new(cache, cancellation_handler, region_id); + // 6h - 1m. + // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
+ let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); + loop { + let cancellation_token = CancellationToken::new(); + interval.tick().await; + + tokio::spawn(handle_messages( + handler.clone(), + redis.clone(), + cancellation_token.clone(), + )); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h. + cancellation_token.cancel(); + }); } } From 00423152c6eeafb731eddc11453ea683dab6196f Mon Sep 17 00:00:00 2001 From: Jure Bajic Date: Fri, 3 May 2024 10:38:19 +0200 Subject: [PATCH 137/157] Store operation identifier in `IdLockMap` on exclusive lock (#7397) ## Problem Issues around operation and tenant locks would have been hard to debug since there was little observability around them. ## Summary of changes - As suggested in the issue, a wrapper was added around `OwnedRwLockWriteGuard` called `IdentifierLock` that removes the operation currently holding the exclusive lock when it's dropped. - The value in `IdLockMap` was extended to hold a pair of locks and operations that can be accessed and locked independently. - When requesting an exclusive lock besides returning the lock on that resource, an operation is changed if the lock is acquired. Closes https://github.com/neondatabase/neon/issues/7108 --- Cargo.lock | 2 + storage_controller/Cargo.toml | 2 + storage_controller/src/id_lock_map.rs | 153 ++++++++++++++++-- storage_controller/src/service.rs | 123 ++++++++++---- test_runner/fixtures/neon_fixtures.py | 100 +++++++----- .../regress/test_storage_controller.py | 48 ++++++ 6 files changed, 348 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 438b68493b..8438dad41b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5856,6 +5856,8 @@ dependencies = [ "routerify", "serde", "serde_json", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-util", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 789420f2b0..194619a496 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -40,6 +40,8 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +strum.workspace = true +strum_macros.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index b03700b50c..dff793289f 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,25 +1,64 @@ +use std::fmt::Display; +use std::time::Instant; use std::{collections::HashMap, sync::Arc}; +use std::time::Duration; + +use crate::service::RECONCILE_TIMEOUT; + +const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; + +/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the +/// current holding operation in lock. +pub struct WrappedWriteGuard { + guard: tokio::sync::OwnedRwLockWriteGuard>, + start: Instant, +} + +impl WrappedWriteGuard { + pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { + Self { + guard, + start: Instant::now(), + } + } +} + +impl Drop for WrappedWriteGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Lock on {} was held for {:?}", + self.guard.as_ref().unwrap(), + duration + ); + } + *self.guard = None; + } +} + /// A map of locks covering some arbitrary identifiers. 
Useful if you have a collection of objects but don't /// want to embed a lock in each one, or if your locking granularity is different to your object granularity. /// For example, used in the storage controller where the objects are tenant shards, but sometimes locking /// is needed at a tenant-wide granularity. -pub(crate) struct IdLockMap +pub(crate) struct IdLockMap where T: Eq + PartialEq + std::hash::Hash, { /// A synchronous lock for getting/setting the async locks that our callers will wait on. - entities: std::sync::Mutex>>>, + entities: std::sync::Mutex>>>>, } -impl IdLockMap +impl IdLockMap where T: Eq + PartialEq + std::hash::Hash, + I: Display, { pub(crate) fn shared( &self, key: T, - ) -> impl std::future::Future> { + ) -> impl std::future::Future>> { let mut locked = self.entities.lock().unwrap(); let entry = locked.entry(key).or_default(); entry.clone().read_owned() @@ -28,21 +67,26 @@ where pub(crate) fn exclusive( &self, key: T, - ) -> impl std::future::Future> { + operation: I, + ) -> impl std::future::Future> { let mut locked = self.entities.lock().unwrap(); - let entry = locked.entry(key).or_default(); - entry.clone().write_owned() + let entry = locked.entry(key).or_default().clone(); + async move { + let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await); + *guard.guard = Some(operation); + guard + } } /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do /// periodic housekeeping to avoid the map growing indefinitely pub(crate) fn housekeeping(&self) { let mut locked = self.entities.lock().unwrap(); - locked.retain(|_k, lock| lock.try_write().is_err()) + locked.retain(|_k, entry| entry.try_write().is_err()) } } -impl Default for IdLockMap +impl Default for IdLockMap where T: Eq + PartialEq + std::hash::Hash, { @@ -52,3 +96,94 @@ where } } } + +pub async fn trace_exclusive_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display + Clone, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> WrappedWriteGuard { + let start = Instant::now(); + let guard = op_locks.exclusive(key.clone(), operation.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for exclusive lock", + operation, + key, + duration + ); + } + + guard +} + +pub async fn trace_shared_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> tokio::sync::OwnedRwLockReadGuard> { + let start = Instant::now(); + let guard = op_locks.shared(key.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for shared lock", + operation, + key, + duration + ); + } + + guard +} + +#[cfg(test)] +mod tests { + use super::IdLockMap; + + #[derive(Clone, Debug, strum_macros::Display, PartialEq)] + enum Operations { + Op1, + Op2, + } + + #[tokio::test] + async fn multiple_shared_locks() { + let id_lock_map: IdLockMap = IdLockMap::default(); + + let shared_lock_1 = id_lock_map.shared(1).await; + let shared_lock_2 = id_lock_map.shared(1).await; + + assert!(shared_lock_1.is_none()); + assert!(shared_lock_2.is_none()); + } + + #[tokio::test] + async fn exclusive_locks() { + let id_lock_map = IdLockMap::default(); + let resource_id = 1; + + { + let _ex_lock = id_lock_map.exclusive(resource_id, Operations::Op1).await; + assert_eq!(_ex_lock.guard.clone().unwrap(), 
Operations::Op1); + + let _ex_lock_2 = tokio::time::timeout( + tokio::time::Duration::from_millis(1), + id_lock_map.exclusive(resource_id, Operations::Op2), + ) + .await; + assert!(_ex_lock_2.is_err()); + } + + let shared_lock_1 = id_lock_map.shared(resource_id).await; + assert!(shared_lock_1.is_none()); + } +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index f26122e646..eaff87d1ce 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -9,7 +9,7 @@ use std::{ use crate::{ compute_hook::NotifyError, - id_lock_map::IdLockMap, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{ScheduleContext, ScheduleMode}, @@ -33,6 +33,7 @@ use pageserver_api::{ models::{SecondaryProgress, TenantConfigRequest}, }; use reqwest::StatusCode; +use tracing::instrument; use crate::pageserver_client::PageserverClient; use pageserver_api::{ @@ -50,11 +51,11 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; -use tokio::sync::{mpsc::error::TrySendError, OwnedRwLockWriteGuard}; +use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use tracing::instrument; use utils::{ completion::Barrier, + failpoint_support, generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, @@ -79,7 +80,7 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); // For operations that might be slow, like migrating a tenant with // some data in it. -const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +pub const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); // If we receive a call using Secondary mode initially, it will omit generation. We will initialize // tenant shards into this generation, and as long as it remains in this generation, we will accept @@ -96,6 +97,26 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); /// (``) pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +#[derive(Clone, strum_macros::Display)] +enum TenantOperations { + Create, + LocationConfig, + ConfigSet, + TimeTravelRemoteStorage, + Delete, + UpdatePolicy, + ShardSplit, + SecondaryDownload, + TimelineCreate, + TimelineDelete, +} + +#[derive(Clone, strum_macros::Display)] +enum NodeOperations { + Register, + Configure, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. @@ -231,11 +252,11 @@ pub struct Service { // Locking on a tenant granularity (covers all shards in the tenant): // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) - tenant_op_locks: IdLockMap, + tenant_op_locks: IdLockMap, // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or // that transition it to/from Active. 
- node_op_locks: IdLockMap, + node_op_locks: IdLockMap, // Limit how many Reconcilers we will spawn concurrently reconciler_concurrency: Arc, @@ -307,7 +328,7 @@ struct TenantShardSplitAbort { new_shard_count: ShardCount, new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant - _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, + _tenant_lock: WrappedWriteGuard, } #[derive(thiserror::Error, Debug)] @@ -1340,7 +1361,7 @@ impl Service { async fn node_activate_reconcile( &self, mut node: Node, - _lock: &OwnedRwLockWriteGuard<()>, + _lock: &WrappedWriteGuard, ) -> Result<(), ApiError> { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated @@ -1586,11 +1607,12 @@ impl Service { let tenant_id = create_req.new_tenant_id.tenant_id; // Exclude any concurrent attempts to create/access the same tenant ID - let _tenant_lock = self - .tenant_op_locks - .exclusive(create_req.new_tenant_id.tenant_id) - .await; - + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + create_req.new_tenant_id.tenant_id, + TenantOperations::Create, + ) + .await; let (response, waiters) = self.do_tenant_create(create_req).await?; if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { @@ -1929,10 +1951,12 @@ impl Service { req: TenantLocationConfigRequest, ) -> Result { // We require an exclusive lock, because we are updating both persistent and in-memory state - let _tenant_lock = self - .tenant_op_locks - .exclusive(tenant_shard_id.tenant_id) - .await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::LocationConfig, + ) + .await; if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( @@ -2050,7 +2074,12 @@ impl Service { pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigSet, + ) + .await; let tenant_id = req.tenant_id; let config = req.config; @@ -2139,7 +2168,12 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> Result<(), ApiError> { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimeTravelRemoteStorage, + ) + .await; let node = { let locked = self.inner.read().unwrap(); @@ -2230,7 +2264,12 @@ impl Service { tenant_id: TenantId, wait: Option, ) -> Result<(StatusCode, SecondaryProgress), ApiError> { - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::SecondaryDownload, + ) + .await; // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { @@ -2324,7 +2363,8 @@ impl Service { } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = + trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; self.ensure_attached_wait(tenant_id).await?; @@ -2424,7 +2464,14 @@ 
impl Service { req: TenantPolicyRequest, ) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::UpdatePolicy, + ) + .await; + + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); let TenantPolicyRequest { placement, @@ -2478,7 +2525,12 @@ impl Service { create_req.new_timeline_id, ); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -2593,7 +2645,12 @@ impl Service { timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDelete, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -3132,7 +3189,12 @@ impl Service { ) -> Result { // TODO: return 503 if we get stuck waiting for this lock // (issue https://github.com/neondatabase/neon/issues/7108) - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::ShardSplit, + ) + .await; let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -3893,9 +3955,13 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + let _node_lock = trace_exclusive_lock( + &self.node_op_locks, + register_req.node_id, + NodeOperations::Register, + ) + .await; - // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { @@ -3982,7 +4048,8 @@ impl Service { availability: Option, scheduling: Option, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(node_id).await; + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Configure).await; if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fbd1e22aa9..19aa4cc886 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1959,6 +1959,55 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) +class LogUtils: + """ + A mixin class which provides utilities for inspecting the logs of a service. 
+ """ + + def __init__(self, logfile: Path) -> None: + self.logfile = logfile + + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + """Check that the log contains a line that matches the given regex""" + logfile = self.logfile + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 + with logfile.open("r") as f: + for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue + elif contains_re.search(line): + # found it! + cur_line_no += 1 + return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 + return None + + class StorageControllerApiException(Exception): def __init__(self, message, status_code: int): super().__init__(message) @@ -1966,12 +2015,13 @@ class StorageControllerApiException(Exception): self.status_code = status_code -class NeonStorageController(MetricsGetter): +class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS + self.logfile = self.workdir / "storage_controller.log" def start(self): assert not self.running @@ -2295,6 +2345,10 @@ class NeonStorageController(MetricsGetter): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + @property + def workdir(self) -> Path: + return self.env.repo_dir + def __enter__(self) -> "NeonStorageController": return self @@ -2312,7 +2366,7 @@ class LogCursor: _line_no: int -class NeonPageserver(PgProtocol): +class NeonPageserver(PgProtocol, LogUtils): """ An object representing a running pageserver. """ @@ -2329,7 +2383,7 @@ class NeonPageserver(PgProtocol): self.service_port = port self.config_override = config_override self.version = env.get_binary_version("pageserver") - + self.logfile = self.workdir / "pageserver.log" # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. 
If your test expects an error, add it to # 'allowed_errors' in the test with something like: @@ -2469,46 +2523,6 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def assert_log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Tuple[str, LogCursor]: - """Convenient for use inside wait_until()""" - - res = self.log_contains(pattern, offset=offset) - assert res is not None - return res - - def log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Optional[Tuple[str, LogCursor]]: - """Check that the pageserver log contains a line that matches the given regex""" - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return None - - contains_re = re.compile(pattern) - - # XXX: Our rust logging machinery buffers the messages, so if you - # call this function immediately after it's been logged, there is - # no guarantee it is already present in the log file. This hasn't - # been a problem in practice, our python tests are not fast enough - # to hit that race condition. - skip_until_line_no = 0 if offset is None else offset._line_no - cur_line_no = 0 - with logfile.open("r") as f: - for line in f: - if cur_line_no < skip_until_line_no: - cur_line_no += 1 - continue - elif contains_re.search(line): - # found it! - cur_line_no += 1 - return (line, LogCursor(cur_line_no)) - else: - cur_line_no += 1 - return None - def tenant_attach( self, tenant_id: TenantId, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index bc1f8776b3..63accebc7c 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,4 +1,5 @@ import json +import threading import time from collections import defaultdict from datetime import datetime, timezone @@ -1259,6 +1260,53 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() +def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): + """ + Check that when lock on resource (tenants, nodes) is held for too long it is + traced in logs. 
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + env.storage_controller.allowed_errors.extend( + [ + ".*Lock on.*", + ".*Scheduling is disabled by policy.*", + f".*Operation TimelineCreate on key {tenant_id} has waited.*", + ] + ) + + # Apply failpoint + env.storage_controller.configure_failpoints( + ("tenant-update-policy-exclusive-lock", "return(31000)") + ) + + # This will hold the exclusive for enough time to cause an warning + def update_tenent_policy(): + env.storage_controller.tenant_policy_update( + tenant_id=tenant_id, + body={ + "scheduling": "Stop", + }, + ) + + thread_update_tenant_policy = threading.Thread(target=update_tenent_policy) + thread_update_tenant_policy.start() + + # Make sure the update policy thread has started + time.sleep(1) + # This will not be able to access and will log a warning + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + thread_update_tenant_policy.join(timeout=10) + + env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") + env.storage_controller.assert_log_contains( + f"Operation TimelineCreate on key {tenant_id} has waited" + ) + + @pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) @pytest.mark.parametrize("shard_count", [None, 4]) def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): From 3582a95c8767fc39f037eed36e0fe3e1052443f2 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 3 May 2024 04:55:48 -0400 Subject: [PATCH 138/157] fix(pageserver): compile warning of download_object.ctx on macos (#7596) fix macOS compile warning introduced in https://github.com/neondatabase/neon/commit/45ec8688ea27cbad9789aac934a23069cbe95595 Signed-off-by: Alex Chi Z --- pageserver/src/tenant/remote_timeline_client/download.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 345a12aa86..b038f264f5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -136,7 +136,7 @@ async fn download_object<'a>( src_path: &RemotePath, dst_path: &Utf8PathBuf, cancel: &CancellationToken, - ctx: &RequestContext, + #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { let res = match crate::virtual_file::io_engine::get() { crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), From 60f570c70da0fec651b5fd5de0d551b60d5f53b6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 3 May 2024 13:11:51 +0300 Subject: [PATCH 139/157] refactor(update_gc_info): split GcInfo to compose out of GcCutoffs (#7584) Split `GcInfo` and replace `Timeline::update_gc_info` with a method that simply finds gc cutoffs `Timeline::find_gc_cutoffs` to be combined as `Timeline::gc_info` at the caller. This change will be followed up with a change that finds the GC cutoff values before taking the `Tenant::gc_cs` lock. 
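Roughly, the shape this refactor moves towards, as a hedged sketch (only `GcInfo`, `retain_lsns`, `cutoffs` and `min_cutoff` appear in the patch; the `GcCutoffs` field names below are assumptions): the timeline computes the two cutoffs, the caller assembles `GcInfo` from the branch points plus those cutoffs, and `min_cutoff` replaces the old `min(pitr_cutoff, horizon_cutoff)` at call sites.

```rust
// Hedged sketch of the composed GcInfo/GcCutoffs shape; not copied from the patch.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

/// What find_gc_cutoffs computes for a single timeline (assumed field names).
#[derive(Debug)]
struct GcCutoffs {
    horizon: Lsn, // space-based cutoff
    pitr: Lsn,    // time-based (PITR) cutoff
}

/// Assembled by the caller from branch points plus the cutoffs.
#[derive(Debug)]
struct GcInfo {
    retain_lsns: Vec<Lsn>,
    cutoffs: GcCutoffs,
}

impl GcInfo {
    /// Replaces min(gc_info.pitr_cutoff, gc_info.horizon_cutoff) at call sites.
    fn min_cutoff(&self) -> Lsn {
        self.cutoffs.horizon.min(self.cutoffs.pitr)
    }
}

fn main() {
    let cutoffs = GcCutoffs { horizon: Lsn(0x60), pitr: Lsn(0x40) };
    let gc_info = GcInfo { retain_lsns: vec![Lsn(0x30)], cutoffs };
    assert_eq!(gc_info.min_cutoff(), Lsn(0x40));
}
```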
Cc: #7560 --- pageserver/src/metrics.rs | 12 ++-- pageserver/src/tenant.rs | 87 ++++++++++++---------------- pageserver/src/tenant/size.rs | 8 ++- pageserver/src/tenant/timeline.rs | 96 +++++++++++++++---------------- 4 files changed, 95 insertions(+), 108 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d3c8c423e4..d8019b08e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -51,8 +51,8 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, - #[strum(serialize = "update gc info")] - UpdateGcInfo, + #[strum(serialize = "find gc cutoffs")] + FindGcCutoffs, #[strum(serialize = "create tenant")] CreateTenant, @@ -1989,7 +1989,7 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, - pub update_gc_info_histo: StorageTimeMetrics, + pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2050,8 +2050,8 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let update_gc_info_histo = StorageTimeMetrics::new( - StorageTimeOperation::UpdateGcInfo, + let find_gc_cutoffs_histo = StorageTimeMetrics::new( + StorageTimeOperation::FindGcCutoffs, &tenant_id, &shard_id, &timeline_id, @@ -2098,7 +2098,7 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, - update_gc_info_histo, + find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, resident_physical_size_gauge, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 05ceff2b59..a6cd1471ff 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -62,6 +62,7 @@ use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; +use self::timeline::GcInfo; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; use crate::config::PageServerConf; @@ -86,7 +87,6 @@ use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; -use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -2886,9 +2886,12 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + + *timeline.gc_info.write().unwrap() = GcInfo { + retain_lsns: branchpoints, + cutoffs, + }; gc_timelines.push(timeline); } @@ -2977,7 +2980,7 @@ impl Tenant { // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + let cutoff = gc_info.min_cutoff(); if start_lsn < cutoff { return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" @@ -4513,18 +4516,20 @@ mod tests { } async fn bulk_insert_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, lsn: Lsn, repeat: usize, key_count: usize, ) -> anyhow::Result<()> { let compact = true; - bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await + 
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await } async fn bulk_insert_maybe_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, @@ -4537,6 +4542,8 @@ mod tests { // Enforce that key range is monotonously increasing let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; @@ -4558,24 +4565,19 @@ mod tests { blknum += 1; } - let cutoff = timeline.get_last_record_lsn(); - - timeline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - ctx, - ) - .await?; timeline.freeze_and_flush().await?; if compact { - timeline - .compact(&CancellationToken::new(), EnumSet::empty(), ctx) - .await?; + // this requires timeline to be &Arc + timeline.compact(&cancel, EnumSet::empty(), ctx).await?; } - timeline.gc().await?; + + // this doesn't really need to use the timeline_id target, but it is closer to what it + // originally was. + let res = tenant + .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) + .await?; + + assert_eq!(res.layers_removed, 0, "this never removes anything"); } Ok(()) @@ -4594,7 +4596,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; Ok(()) } @@ -4625,7 +4627,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; guard.layer_map().dump(true, &ctx).await?; @@ -5079,6 +5081,7 @@ mod tests { .await?; const NUM_KEYS: usize = 1000; + let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -5138,18 +5141,10 @@ mod tests { } // Perform a cycle of flush, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline.gc().await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; } Ok(()) @@ -5170,6 +5165,8 @@ mod tests { let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; @@ -5233,21 +5230,11 @@ mod tests { } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5452,7 +5439,7 @@ mod tests { let lsn = Lsn(0x10); let compact = false; - bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?; + bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index f521dfa55d..974c1091fd 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -192,7 +192,9 @@ pub(super) async fn gather_inputs( // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // horizon_cutoff. - let mut next_gc_cutoff = gc_info.pitr_cutoff; + let pitr_cutoff = gc_info.cutoffs.pitr; + let horizon_cutoff = gc_info.cutoffs.horizon; + let mut next_gc_cutoff = pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { @@ -297,8 +299,8 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff: gc_info.horizon_cutoff, - pitr_cutoff: gc_info.pitr_cutoff, + horizon_cutoff, + pitr_cutoff, next_gc_cutoff, retention_param_cutoff, }); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 22bfa53445..7aeb3a6a59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -325,7 +325,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. - pub gc_info: std::sync::RwLock, + pub(crate) gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -409,33 +409,59 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// /// Information about how much history needs to be retained, needed by /// Garbage Collection. -/// -pub struct GcInfo { +#[derive(Default)] +pub(crate) struct GcInfo { /// Specific LSNs that are needed. /// /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub retain_lsns: Vec, + pub(crate) retain_lsns: Vec, - /// In addition to 'retain_lsns', keep everything newer than this - /// point. + /// The cutoff coordinates, which are combined by selecting the minimum. 
+ pub(crate) cutoffs: GcCutoffs, +} + +impl GcInfo { + pub(crate) fn min_cutoff(&self) -> Lsn { + self.cutoffs.select_min() + } +} + +/// The `GcInfo` component describing which Lsns need to be retained. +#[derive(Debug)] +pub(crate) struct GcCutoffs { + /// Keep everything newer than this point. /// /// This is calculated by subtracting 'gc_horizon' setting from /// last-record LSN /// /// FIXME: is this inclusive or exclusive? - pub horizon_cutoff: Lsn, + pub(crate) horizon: Lsn, /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this /// point. /// /// This is calculated by finding a number such that a record is needed for PITR /// if only if its LSN is larger than 'pitr_cutoff'. - pub pitr_cutoff: Lsn, + pub(crate) pitr: Lsn, +} + +impl Default for GcCutoffs { + fn default() -> Self { + Self { + horizon: Lsn::INVALID, + pitr: Lsn::INVALID, + } + } +} + +impl GcCutoffs { + fn select_min(&self) -> Lsn { + std::cmp::min(self.horizon, self.pitr) + } } /// An error happened in a get() operation. @@ -1155,7 +1181,7 @@ impl Timeline { " - keyspace={:?} lsn={}"), seq_err, keyspace, lsn) }, (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { - // Sequential get runs after vectored get, so it is possible for the later + // Sequential get runs after vectored get, so it is possible for the later // to time out while waiting for its ancestor's Lsn to become ready and for the // former to succeed (it essentially has a doubled wait time). }, @@ -2097,11 +2123,7 @@ impl Timeline { write_lock: tokio::sync::Mutex::new(None), - gc_info: std::sync::RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo::default()), latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -4383,7 +4405,7 @@ impl Timeline { Ok(()) } - /// Update information about which layer files need to be retained on + /// Find the Lsns above which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete /// page versions more aggressively. @@ -4391,17 +4413,6 @@ impl Timeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. /// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. (As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart @@ -4409,26 +4420,17 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - /// - /// NOTE: This function holds a short-lived lock to protect the 'gc_info' - /// field, so that the three values passed as argument are stored - /// atomically. 
But the caller is responsible for ensuring that no new - /// branches are created that would need to be included in 'retain_lsns', - /// for example. The caller should hold `Tenant::gc_cs` lock to ensure - /// that. - /// #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(super) async fn update_gc_info( + pub(super) async fn find_gc_cutoffs( &self, - retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let _timer = self .metrics - .update_gc_info_histo + .find_gc_cutoffs_histo .start_timer() .record_on_drop(); @@ -4481,14 +4483,10 @@ impl Timeline { self.get_last_record_lsn() }; - // Grab the lock and update the values - *self.gc_info.write().unwrap() = GcInfo { - retain_lsns, - horizon_cutoff: cutoff_horizon, - pitr_cutoff, - }; - - Ok(()) + Ok(GcCutoffs { + horizon: cutoff_horizon, + pitr: pitr_cutoff, + }) } /// Garbage collect layer files on a timeline that are no longer needed. @@ -4517,8 +4515,8 @@ impl Timeline { let (horizon_cutoff, pitr_cutoff, retain_lsns) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; + let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.cutoffs.pitr; let retain_lsns = gc_info.retain_lsns.clone(); (horizon_cutoff, pitr_cutoff, retain_lsns) }; From d76963691f556566bfe08581b7cc32cdca5ee800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 3 May 2024 13:23:11 +0200 Subject: [PATCH 140/157] Increase Azure parallelism limit to 100 (#7597) After #5563 has been addressed we can now set the Azure strorage parallelism limit to 100 like it is for S3. Part of #5567 --- libs/remote_storage/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 32bc71c513..708662f20f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -55,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; -/// We set this a little bit low as we currently buffer the entire file into RAM +/// Set this limit analogously to the S3 limit /// /// Here, a limit of max 20k concurrent connections was noted. /// -pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30; +pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; From 37b1930b2f6cb072087cdc011d12a91342a4afc9 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 3 May 2024 12:40:09 +0100 Subject: [PATCH 141/157] tests: relax test download remote layers api (#7604) ## Problem This test triggers layer download failures on demand. It is possible to modify the failpoint during a `Timeline::get_vectored` right between the vectored read and it's validation read. This means that one of the reads can fail while the other one succeeds and vice versa. ## Summary of changes These errors are expected, so allow them to happen. 
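As a deliberately simplified model of the race (this is not the pageserver code itself, just an illustration): a flag flipped between two otherwise-identical reads makes their outcomes disagree, which is why either of the two mismatch errors may legitimately show up in the logs.

```rust
use std::sync::atomic::{AtomicBool, Ordering};

// Toy stand-in for the "download failure" failpoint; not the real failpoint machinery.
static FAIL_DOWNLOADS: AtomicBool = AtomicBool::new(false);

fn read(kind: &str) -> Result<(), String> {
    if FAIL_DOWNLOADS.load(Ordering::SeqCst) {
        Err(format!("{kind} get failed: downloading evicted layer file failed"))
    } else {
        Ok(())
    }
}

fn main() {
    let vectored = read("Vectored"); // failpoint still off: succeeds
    FAIL_DOWNLOADS.store(true, Ordering::SeqCst); // test toggles the failpoint here
    let sequential = read("Sequential"); // validation read now fails
    assert!(vectored.is_ok());
    assert!(sequential.is_err());
    println!("vectored={vectored:?} sequential={sequential:?}");
}
```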
--- test_runner/regress/test_ondemand_download.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index ba0d53704b..6c2556f6a2 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -333,6 +333,17 @@ def test_download_remote_layers_api( } ) + # This test triggers layer download failures on demand. It is possible to modify the failpoint + # during a `Timeline::get_vectored` right between the vectored read and it's validation read. + # This means that one of the reads can fail while the other one succeeds and vice versa. + # TODO(vlad): Remove this block once the vectored read path validation goes away. + env.pageserver.allowed_errors.extend( + [ + ".*initial_size_calculation.*Vectored get failed with downloading evicted layer file failed, but sequential get did not.*" + ".*initial_size_calculation.*Sequential get failed with downloading evicted layer file failed, but vectored get did not.*" + ] + ) + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() From b7385bb016a3264a5110e6309fff9fd218e95a97 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 May 2024 12:52:43 +0100 Subject: [PATCH 142/157] storage_controller: fix non-timeline passthrough GETs (#7602) ## Problem We were matching on `/tenant/:tenant_id` and `/tenant/:tenant_id/timeline*`, but not non-timeline tenant sub-paths. There aren't many: this was only noticeable when using the synthetic_size endpoint by hand. ## Summary of changes - Change the wildcard from `/tenant/:tenant_id/timeline*` to `/tenant/:tenant_id/*` - Add test lines that exercise this --- storage_controller/src/http.rs | 11 ++++++----- test_runner/regress/test_storage_controller.py | 4 ++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index f9a79afb0d..604ad6fbaa 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -912,7 +912,7 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) - // Tenant detail GET passthrough to shard zero + // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( r, @@ -920,13 +920,14 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) - // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future - // timeline GET APIs will be implicitly included. - .get("/v1/tenant/:tenant_id/timeline*", |r| { + // The `*` in the URL is a wildcard: any tenant/timeline GET APIs on the pageserver + // are implicitly exposed here. This must be last in the list to avoid + // taking precedence over other GET methods we might implement by hand. 
+ .get("/v1/tenant/:tenant_id/*", |r| { tenant_service_handler( r, handle_tenant_timeline_passthrough, - RequestName("v1_tenant_timeline_passthrough"), + RequestName("v1_tenant_passthrough"), ) }) } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 63accebc7c..fdcb4cf9a4 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -230,6 +230,10 @@ def test_storage_controller_passthrough( } assert status["state"]["slug"] == "Active" + (synthetic_size, size_inputs) = client.tenant_size_and_modelinputs(env.initial_tenant) + assert synthetic_size > 0 + assert "segments" in size_inputs + env.storage_controller.consistency_check() From ed9a114bde38b971f49dd12b53163587477fdcc4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 3 May 2024 14:57:26 +0300 Subject: [PATCH 143/157] fix: find gc cutoff points without holding Tenant::gc_cs (#7585) The current implementation of finding timeline gc cutoff Lsn(s) is done while holding `Tenant::gc_cs`. In recent incidents long create branch times were caused by holding the `Tenant::gc_cs` over extremely long `Timeline::find_lsn_by_timestamp`. The fix is to find the GC cutoff values before taking the `Tenant::gc_cs` lock. This change is safe to do because the GC cutoff values and the branch points have no dependencies on each other. In the case of `Timeline::find_gc_cutoff` taking a long time with this change, we should no longer see `Tenant::gc_cs` interfering with branch creation. Additionally, the `Tenant::refresh_gc_info` is now tolerant of timeline deletions (or any other failures to find the pitr_cutoff). This helps with the synthetic size calculation being constantly completed instead of having a break for a timely timeline deletion. Fixes: #7560 Fixes: #7587 --- pageserver/src/tenant.rs | 75 +++++++++++++++++++++---- pageserver/src/tenant/size.rs | 5 +- pageserver/src/tenant/timeline.rs | 2 + test_runner/regress/test_branching.py | 24 ++++++++ test_runner/regress/test_tenant_size.py | 67 +++++++++++++++++++++- 5 files changed, 157 insertions(+), 16 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a6cd1471ff..8fa484e7b2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -62,9 +62,9 @@ use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; -use self::timeline::GcInfo; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; +use self::timeline::{GcCutoffs, GcInfo}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -2812,7 +2812,48 @@ impl Tenant { cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result>> { - // grab mutex to prevent new timelines from being created here. + // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for + // currently visible timelines. 
+ let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| match target_timeline_id.as_ref() { + Some(target) => &tl.timeline_id == target, + None => true, + }) + .cloned() + .collect::>(); + + let mut gc_cutoffs: HashMap = + HashMap::with_capacity(timelines.len()); + + for timeline in timelines.iter() { + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await; + + match res { + Ok(cutoffs) => { + let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); + assert!(old.is_none()); + } + Err(e) => { + tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}"); + } + } + } + + if !self.is_active() { + anyhow::bail!("shutting down"); + } + + // grab mutex to prevent new timelines from being created here; avoid doing long operations + // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; // Scan all timelines. For each timeline, remember the timeline ID and @@ -2874,11 +2915,6 @@ impl Tenant { } } - let cutoff = timeline - .get_last_record_lsn() - .checked_sub(horizon) - .unwrap_or(Lsn(0)); - let branchpoints: Vec = all_branchpoints .range(( Included((timeline_id, Lsn(0))), @@ -2886,12 +2922,27 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; - *timeline.gc_info.write().unwrap() = GcInfo { - retain_lsns: branchpoints, - cutoffs, - }; + { + let mut target = timeline.gc_info.write().unwrap(); + + match gc_cutoffs.remove(&timeline_id) { + Some(cutoffs) => { + *target = GcInfo { + retain_lsns: branchpoints, + cutoffs, + }; + } + None => { + // reasons for this being unavailable: + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + // + // in both cases, refreshing the branchpoints is correct. + target.retain_lsns = branchpoints; + } + }; + } gc_timelines.push(timeline); } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 974c1091fd..64fff5536c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -118,9 +118,6 @@ pub(super) async fn gather_inputs( ctx: &RequestContext, ) -> anyhow::Result { // refresh is needed to update gc related pitr_cutoff and horizon_cutoff - // - // FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the - // whole computation. It does not make sense from the billing perspective. tenant .refresh_gc_info(cancel, ctx) .await @@ -221,6 +218,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + drop(gc_info); + // Add branch points we collected earlier, just in case there were any that were // not present in retain_lsns. We will remove any duplicates below later. if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7aeb3a6a59..19228bc1f1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4434,6 +4434,8 @@ impl Timeline { .start_timer() .record_on_drop(); + pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. 
// // Some unit tests depend on garbage-collection working even when diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 5b69649007..9fe9f77fea 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,6 +1,7 @@ import random import threading import time +from concurrent.futures import ThreadPoolExecutor from typing import List import pytest @@ -405,6 +406,29 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 +def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + failpoint = "Timeline::find_gc_cutoffs-pausable" + + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.timeline_gc, env.initial_tenant, env.initial_timeline, None) + + wait_until_paused(env, failpoint) + + env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + + client.configure_failpoints((failpoint, "off")) + + completion.result() + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index a588f6ab53..53da548524 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,4 +1,5 @@ import os +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Tuple @@ -11,13 +12,15 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( + tenant_delete_wait_completed, timeline_delete_wait_completed, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): @@ -616,6 +619,68 @@ def test_get_tenant_size_with_multiple_branches( size_debug_file.write(size_debug) +def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): + """ + Makes sure synthetic size can still be calculated even if one of the + timelines is deleted or the tenant is deleted. 
+ """ + + env = neon_env_builder.init_start() + failpoint = "Timeline::find_gc_cutoffs-pausable" + client = env.pageserver.http_client() + + orig_size = client.tenant_size(env.initial_tenant) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + _, last_offset = wait_until( + 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + timeline_delete_wait_completed(client, env.initial_tenant, branch_id) + + client.configure_failpoints((failpoint, "off")) + size = completion.result() + + assert_size_approx_equal(orig_size, size) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + wait_until( + 10, + 1.0, + lambda: env.pageserver.assert_log_contains( + f"at failpoint {failpoint}", offset=last_offset + ), + ) + + tenant_delete_wait_completed(client, env.initial_tenant, 10) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="Failed to refresh gc_info before gathering inputs" + ): + completion.result() + + # this happens on both cases + env.pageserver.allowed_errors.append( + ".*ignoring failure to find gc cutoffs: timeline shutting down.*" + ) + # this happens only in the case of deletion (http response logging) + env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*") + + # Helper for tests that compare timeline_inputs # We don't want to compare the exact values, because they can be unstable # and cause flaky tests. So replace the values with useful invariants. From 8b4dd5dc277164dbb175319c39ee7b64ed9f9f91 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 May 2024 13:31:25 +0100 Subject: [PATCH 144/157] pageserver: jitter secondary periods (#7544) ## Problem After some time the load from heatmap uploads gets rather spiky. They're unintentionally synchronising. Chart (does this make a _boing_ sound in anyone else's head?): ![image](https://github.com/neondatabase/neon/assets/944640/18829fc8-c5b7-4739-9a9b-491b5d6fcade) ## Summary of changes - Add a helper `period_jitter` and apply a 5% jitter from downloader and heatmap_uploader when updating the next runtime at the end of an interation. - Refactor existing places that we pick a startup interval into `period_warmup`, so that the intent is obvious. 
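For reference, the two helpers are small enough to read on their own; a self-contained version (assuming the `rand` 0.8 API already used in this workspace) is sketched below. With a ~5% jitter the per-tenant periods drift apart over time instead of re-synchronising, which is what flattens the spikes in the chart above.

```rust
use rand::Rng;
use std::time::Duration;

/// Jitter a period by +/- pct percent, as added in scheduler.rs below.
fn period_jitter(d: Duration, pct: u32) -> Duration {
    if d == Duration::ZERO {
        d
    } else {
        rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100)
    }
}

/// Pick a random warm-up delay in 0..period so that many tasks started at the
/// same moment spread across the first period instead of firing together.
fn period_warmup(period: Duration) -> Duration {
    if period == Duration::ZERO {
        period
    } else {
        rand::thread_rng().gen_range(Duration::ZERO..period)
    }
}

fn main() {
    let period = Duration::from_secs(60);
    println!("first run in {:?}", period_warmup(period));
    println!("next run in  {:?}", period_jitter(period, 5));
}
```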
--- pageserver/src/tenant/secondary/downloader.rs | 16 +++++++------- .../src/tenant/secondary/heatmap_uploader.rs | 22 +++++++++---------- pageserver/src/tenant/secondary/scheduler.rs | 21 ++++++++++++++++++ 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8a987b5ade..fb8907b5a8 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -31,7 +31,10 @@ use crate::{ use super::{ heatmap::HeatMapLayer, - scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, + TenantBackgroundJobs, + }, SecondaryTenant, }; @@ -45,7 +48,6 @@ use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use rand::Rng; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; @@ -274,7 +276,7 @@ impl JobGenerator SchedulingResult { @@ -305,11 +307,9 @@ impl JobGenerator let state = self .tenants .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| { - let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); - - UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), - last_digest: None, - } + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), + last_digest: None, }); // Decline to do the upload if insufficient time has passed @@ -274,7 +272,7 @@ impl JobGenerator let next_upload = tenant .get_heatmap_period() - .and_then(|period| now.checked_add(period)); + .and_then(|period| now.checked_add(period_jitter(period, 5))); WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 3bd7be782e..3d042f4513 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,4 +1,5 @@ use futures::Future; +use rand::Rng; use std::{ collections::HashMap, marker::PhantomData, @@ -19,6 +20,26 @@ use super::{CommandRequest, CommandResponse}; const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); +/// Jitter a Duration by an integer percentage. Returned values are uniform +/// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range) +pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration { + if d == Duration::ZERO { + d + } else { + rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) + } +} + +/// When a periodic task first starts, it should wait for some time in the range 0..period, so +/// that starting many such tasks at the same time spreads them across the time range. +pub(super) fn period_warmup(period: Duration) -> Duration { + if period == Duration::ZERO { + period + } else { + rand::thread_rng().gen_range(Duration::ZERO..period) + } +} + /// Scheduling helper for background work across many tenants. 
/// /// Systems that need to run background work across many tenants may use this type From 426598cf76d5cc77471b000b9d9880df5059cfa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 3 May 2024 15:59:28 +0200 Subject: [PATCH 145/157] Update rust to 1.78.0 (#7598) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. Release notes: https://blog.rust-lang.org/2024/05/02/Rust-1.78.0.html Prior update was in #7198 --- Dockerfile.build-tools | 4 +-- compute_tools/src/spec.rs | 2 +- control_plane/src/local_env.rs | 5 ++- libs/pageserver_api/src/shard.rs | 6 ++-- pageserver/src/pgdatadir_mapping.rs | 33 ++++++++++--------- pageserver/src/tenant/layer_map.rs | 12 +++---- .../walreceiver/connection_manager.rs | 4 +-- .../virtual_file/owned_buffers_io/write.rs | 7 ++-- rust-toolchain.toml | 2 +- 9 files changed, 39 insertions(+), 36 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index a082f15c34..19739cc1f8 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -65,7 +65,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ && mv s5cmd /usr/local/bin/s5cmd # LLVM -ENV LLVM_VERSION=17 +ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -141,7 +141,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.77.0 +ENV RUSTC_VERSION=1.78.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 269177ee16..3a6e18b638 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); - if existing_dbs.get(&op.name).is_some() { + if existing_dbs.contains_key(&op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 59b587389c..6437d04ec8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -382,7 +382,10 @@ impl LocalEnv { // Find neon binaries. if env.neon_distrib_dir == Path::new("") { - env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); + env::current_exe()? + .parent() + .unwrap() + .clone_into(&mut env.neon_distrib_dir); } if env.pageservers.is_empty() { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index d769b2fd2f..ff6d3d91b6 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -97,7 +97,7 @@ impl ShardCount { /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as `TenantShardId::unsharded`. + /// as [`TenantShardId::unsharded`]. /// /// This method returns the actual number of shards, i.e. if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). 
@@ -116,7 +116,9 @@ impl ShardCount { self.0 } - /// + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. pub fn is_unsharded(&self) -> bool { self.0 == 0 } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 015191b875..12314c5961 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -279,7 +279,7 @@ impl Timeline { match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -379,7 +379,7 @@ impl Timeline { match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.segments.get(&segno).is_some(); + let exists = dir.segments.contains(&segno); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -1143,21 +1143,22 @@ impl<'a> DatadirModification<'a> { let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { - // Didn't exist. Update dbdir - dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); - self.put(DBDIR_KEY, Value::Image(buf.into())); + let mut rel_dir = + if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { + // Didn't exist. Update dbdir + e.insert(false); + let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.put(DBDIR_KEY, Value::Image(buf.into())); - // and create the RelDirectory - RelDirectory::default() - } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? - }; + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? 
+ }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 3c4de8fe4d..2724a5cc07 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -588,7 +588,7 @@ impl LayerMap { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Add the final interval @@ -672,12 +672,12 @@ impl LayerMap { // Loop through the delta coverage and recurse on each part for (change_key, change_val) in version.delta_coverage.range(start..end) { // If there's a relevant delta in this part, add 1 and recurse down - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -689,17 +689,17 @@ impl LayerMap { } current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Consider the last part - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(end); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 7ef063c4e5..991e4ac045 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1535,7 +1535,7 @@ mod tests { let harness = TenantHarness::create("switch_to_same_availability_zone")?; let mut state = dummy_state(&harness).await; - state.conf.availability_zone = test_az.clone(); + state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1568,7 +1568,7 @@ mod tests { // We have another safekeeper with the same commit_lsn, and it have the same availability zone as // the current pageserver. 
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); - same_az_sk.timeline.availability_zone = test_az.clone(); + same_az_sk.timeline.availability_zone.clone_from(&test_az); state.wal_stream_candidates = HashMap::from([ ( diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index ac5169508f..885a9221c5 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -76,14 +76,11 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered( + pub async fn write_buffered( &mut self, chunk: Slice, ctx: &RequestContext, - ) -> std::io::Result<(usize, S)> - where - S: IoBuf + Send, - { + ) -> std::io::Result<(usize, S)> { let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 50a5a4185b..214de0a77d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.77.0" +channel = "1.78.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From ce0ddd749c5945f0660ec0f9327c8aacc77f4666 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 3 May 2024 16:05:00 +0200 Subject: [PATCH 146/157] test_runner: remove unused `NeonPageserver.config_override` field (#7605) refs https://github.com/neondatabase/neon/issues/7555 --- test_runner/fixtures/neon_fixtures.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 19aa4cc886..90884ad7f8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1104,7 +1104,6 @@ class NeonEnv: self, ps_id, port=pageserver_port, - config_override=self.pageserver_config_override, ) ) cfg["pageservers"].append(ps_cfg) @@ -2373,15 +2372,12 @@ class NeonPageserver(PgProtocol, LogUtils): TEMP_FILE_SUFFIX = "___temp" - def __init__( - self, env: NeonEnv, id: int, port: PageserverPort, config_override: Optional[str] = None - ): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id self.running = False self.service_port = port - self.config_override = config_override self.version = env.get_binary_version("pageserver") self.logfile = self.workdir / "pageserver.log" # After a test finishes, we will scrape the log to see if there are any From b5a6e68e686128652b491aa3fb6cfcfdc0a611ad Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 3 May 2024 15:28:23 +0100 Subject: [PATCH 147/157] storage controller: check warmth of secondary before doing proactive migration (#7583) ## Problem The logic in Service::optimize_all would sometimes choose to migrate a tenant to a secondary location that was only recently created, resulting in Reconciler::live_migrate hitting its 5 minute timeout warming up the location, and proceeding to attach a tenant to a location that doesn't have a warm enough local set of layer files for good performance. Closes: #7532 ## Summary of changes - Add a pageserver API for checking download progress of a secondary location - During `optimize_all`, connect to pageservers of candidate optimization secondary locations, and check they are warm. 
- During shard split, do heatmap uploads and start secondary downloads, so that the new shards' secondary locations start downloading ASAP, rather than waiting minutes for background downloads to kick in. I have intentionally not implemented this by continuously reading the status of locations, to avoid dealing with the scale challenge of efficiently polling & updating 10k-100k locations status. If we implement that in the future, then this code can be simplified to act based on latest state of a location rather than fetching it inline during optimize_all. --- pageserver/client/src/mgmt_api.rs | 28 ++ pageserver/src/http/routes.rs | 24 ++ storage_controller/src/pageserver_client.rs | 21 ++ storage_controller/src/service.rs | 352 ++++++++++++++++++-- storage_controller/src/tenant_shard.rs | 87 +++-- test_runner/regress/test_sharding.py | 22 +- 6 files changed, 471 insertions(+), 63 deletions(-) diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index bc66c5c6e1..6df8b2170d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -284,6 +284,34 @@ impl Client { Ok((status, progress)) } + pub async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/secondary/status", + self.mgmt_api_endpoint, tenant_shard_id + )) + .expect("Cannot build URL"); + + self.request(Method::GET, path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/heatmap_upload", + self.mgmt_api_endpoint, tenant_id + )) + .expect("Cannot build URL"); + + self.request(Method::POST, path, ()).await?; + Ok(()) + } + pub async fn location_config( &self, tenant_shard_id: TenantShardId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ae1e7aac78..cf526940f4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2160,6 +2160,27 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn secondary_status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(StatusCode::OK, progress) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -2521,6 +2542,9 @@ pub fn make_router( .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| { + api_handler(r, secondary_status_handler) + }) .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 0cea205599..25b6b67e12 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -114,6 +114,27 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_secondary_status( + &self, + tenant_shard_id: 
TenantShardId, + ) -> Result { + measured_request!( + "tenant_secondary_status", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_secondary_status(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + measured_request!( + "tenant_heatmap_upload", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_heatmap_upload(tenant_id).await + ) + } + pub(crate) async fn location_config( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index eaff87d1ce..d3a53066c9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -13,7 +13,9 @@ use crate::{ persistence::{AbortShardSplitStatus, TenantFilter}, reconciler::{ReconcileError, ReconcileUnits}, scheduler::{ScheduleContext, ScheduleMode}, - tenant_shard::ReconcileNeeded, + tenant_shard::{ + MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction, + }, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -709,7 +711,7 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - self.optimize_all(); + self.optimize_all().await; } } _ = self.cancel.cancelled() => return @@ -2639,6 +2641,45 @@ impl Service { Ok(results) } + /// Concurrently invoke a pageserver API call on many shards at once + pub(crate) async fn tenant_for_shards_api( + &self, + locations: Vec<(TenantShardId, Node)>, + op: O, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Vec> + where + O: Fn(TenantShardId, PageserverClient) -> F + Copy, + F: std::future::Future>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(async move { + node.with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.jwt_token, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await + }); + } + + while let Some(r) = futs.next().await { + let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); + results.push(r); + } + + results + } + pub(crate) async fn tenant_timeline_delete( &self, tenant_id: TenantId, @@ -3088,11 +3129,14 @@ impl Service { ) -> ( TenantShardSplitResponse, Vec<(TenantShardId, NodeId, ShardStripeSize)>, + Vec, ) { let mut response = TenantShardSplitResponse { new_shards: Vec::new(), }; let mut child_locations = Vec::new(); + let mut waiters = Vec::new(); + { let mut locked = self.inner.write().unwrap(); @@ -3171,14 +3215,112 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - self.maybe_reconcile_shard(&mut child_state, nodes); + if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + waiters.push(waiter); + } tenants.insert(child, child_state); response.new_shards.push(child); } } + (response, child_locations, waiters) + } + } - (response, child_locations) + async fn tenant_shard_split_start_secondaries( + &self, + tenant_id: TenantId, + waiters: Vec, + ) { + // Wait for initial reconcile of child shards, this creates the secondary locations + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // This is not a failure to split: it's some issue reconciling the new child shards, perhaps + // their 
secondaries couldn't be attached. + tracing::warn!("Failed to reconcile after split: {e}"); + return; + } + + // Take the state lock to discover the attached & secondary intents for all shards + let (attached, secondary) = { + let locked = self.inner.read().unwrap(); + let mut attached = Vec::new(); + let mut secondary = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let Some(node_id) = shard.intent.get_attached() else { + // Unexpected. Race with a PlacementPolicy change? + tracing::warn!( + "No attached node on {tenant_shard_id} immediately after shard split!" + ); + continue; + }; + + let Some(secondary_node_id) = shard.intent.get_secondary().first() else { + // No secondary location. Nothing for us to do. + continue; + }; + + let attached_node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + let secondary_node = locked + .nodes + .get(secondary_node_id) + .expect("Pageservers may not be deleted while referenced"); + + attached.push((*tenant_shard_id, attached_node.clone())); + secondary.push((*tenant_shard_id, secondary_node.clone())); + } + (attached, secondary) + }; + + if secondary.is_empty() { + // No secondary locations; nothing for us to do + return; + } + + for result in self + .tenant_for_shards_api( + attached, + |tenant_shard_id, client| async move { + client.tenant_heatmap_upload(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling heatmap upload after shard split: {e}"); + return; + } + } + + for result in self + .tenant_for_shards_api( + secondary, + |tenant_shard_id, client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::ZERO)) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling secondary download after shard split: {e}"); + return; + } } } @@ -3212,8 +3354,8 @@ impl Service { .do_tenant_shard_split(tenant_id, shard_split_params) .await; - match r { - Ok(r) => Ok(r), + let (response, waiters) = match r { + Ok(r) => r, Err(e) => { // Split might be part-done, we must do work to abort it. tracing::warn!("Enqueuing background abort of split on {tenant_id}"); @@ -3226,9 +3368,17 @@ impl Service { }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); - Err(e) + return Err(e); } - } + }; + + // The split is now complete. As an optimization, we will trigger all the child shards to upload + // a heatmap immediately, and all their secondary locations to start downloading: this avoids waiting + // for the background heatmap/download interval before secondaries get warm enough to migrate shards + // in [`Self::optimize_all`] + self.tenant_shard_split_start_secondaries(tenant_id, waiters) + .await; + Ok(response) } fn prepare_tenant_shard_split( @@ -3378,7 +3528,7 @@ impl Service { &self, tenant_id: TenantId, params: ShardSplitParams, - ) -> Result { + ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. 
begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the @@ -3580,7 +3730,7 @@ impl Service { )); // Replace all the shards we just split with their children: this phase is infallible. - let (response, child_locations) = + let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards @@ -3607,7 +3757,7 @@ impl Service { } } - Ok(response) + Ok((response, waiters)) } pub(crate) async fn tenant_shard_migrate( @@ -4373,25 +4523,68 @@ impl Service { /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at /// the time of scheduling, this function looks for cases where a better-scoring location is available /// according to those same soft constraints. - fn optimize_all(&self) -> usize { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - let pageservers = nodes.clone(); - - let mut schedule_context = ScheduleContext::default(); - - let mut reconciles_spawned = 0; - - let mut tenant_shards: Vec<&TenantShard> = Vec::new(); - + async fn optimize_all(&self) -> usize { // Limit on how many shards' optmizations each call to this function will execute. Combined // with the frequency of background calls, this acts as an implicit rate limit that runs a small // trickle of optimizations in the background, rather than executing a large number in parallel // when a change occurs. - const MAX_OPTIMIZATIONS_PER_PASS: usize = 2; + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + + // Synchronous prepare: scan shards for possible scheduling optimizations + let candidate_work = self.optimize_all_plan(); + let candidate_work_len = candidate_work.len(); + + // Asynchronous validate: I/O to pageservers to make sure shards are in a good state to apply validation + let validated_work = self.optimize_all_validate(candidate_work).await; + + let was_work_filtered = validated_work.len() != candidate_work_len; + + // Synchronous apply: update the shards' intent states according to validated optimisations + let mut reconciles_spawned = 0; + let mut optimizations_applied = 0; + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, optimization) in validated_work { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + // Shard was dropped between planning and execution; + continue; + }; + if shard.apply_optimization(scheduler, optimization) { + optimizations_applied += 1; + if self.maybe_reconcile_shard(shard, nodes).is_some() { + reconciles_spawned += 1; + } + } + + if optimizations_applied >= MAX_OPTIMIZATIONS_EXEC_PER_PASS { + break; + } + } + + if was_work_filtered { + // If we filtered any work out during validation, ensure we return a nonzero value to indicate + // to callers that the system is not in a truly quiet state, it's going to do some work as soon + // as these validations start passing. 
+ reconciles_spawned = std::cmp::max(reconciles_spawned, 1); + } + + reconciles_spawned + } + + fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { + let mut schedule_context = ScheduleContext::default(); + + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); + + // How many candidate optimizations we will generate, before evaluating them for readniess: setting + // this higher than the execution limit gives us a chance to execute some work even if the first + // few optimizations we find are not ready. + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; let mut work = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in tenants.iter() { if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant @@ -4400,7 +4593,7 @@ impl Service { tenant_shards.clear(); } - if work.len() >= MAX_OPTIMIZATIONS_PER_PASS { + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { break; } @@ -4472,18 +4665,105 @@ impl Service { } } - for (tenant_shard_id, optimization) in work { - let shard = tenants - .get_mut(&tenant_shard_id) - .expect("We held lock from place we got this ID"); - shard.apply_optimization(scheduler, optimization); + work + } - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { - reconciles_spawned += 1; + async fn optimize_all_validate( + &self, + candidate_work: Vec<(TenantShardId, ScheduleOptimization)>, + ) -> Vec<(TenantShardId, ScheduleOptimization)> { + // Take a clone of the node map to use outside the lock in async validation phase + let validation_nodes = { self.inner.read().unwrap().nodes.clone() }; + + let mut want_secondary_status = Vec::new(); + + // Validate our plans: this is an async phase where we may do I/O to pageservers to + // check that the state of locations is acceptable to run the optimization, such as + // checking that a secondary location is sufficiently warmed-up to cleanly cut over + // in a live migration. + let mut validated_work = Vec::new(); + for (tenant_shard_id, optimization) in candidate_work { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: _, + new_attached_node_id, + }) => { + match validation_nodes.get(&new_attached_node_id) { + None => { + // Node was dropped between planning and validation + } + Some(node) => { + if !node.is_available() { + tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + } else { + // Accumulate optimizations that require fetching secondary status, so that we can execute these + // remote API requests concurrently. + want_secondary_status.push(( + tenant_shard_id, + node.clone(), + optimization, + )); + } + } + } + } + ScheduleOptimizationAction::ReplaceSecondary(_) => { + // No extra checks needed to replace a secondary: this does not interrupt client access + validated_work.push((tenant_shard_id, optimization)) + } + }; + } + + // Call into pageserver API to find out if the destination secondary location is warm enough for a reasonably smooth migration: we + // do this so that we avoid spawning a Reconciler that would have to wait minutes/hours for a destination to warm up: that reconciler + // would hold a precious reconcile semaphore unit the whole time it was waiting for the destination to warm up. 
+ let results = self + .tenant_for_shards_api( + want_secondary_status + .iter() + .map(|i| (i.0, i.1.clone())) + .collect(), + |tenant_shard_id, client| async move { + client.tenant_secondary_status(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + for ((tenant_shard_id, node, optimization), secondary_status) in + want_secondary_status.into_iter().zip(results.into_iter()) + { + match secondary_status { + Err(e) => { + tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + } + Ok(progress) => { + // We require secondary locations to have less than 10GiB of downloads pending before we will use + // them in an optimization + const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; + + if progress.bytes_total == 0 + || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD + && progress.bytes_downloaded != progress.bytes_total + || progress.bytes_total - progress.bytes_downloaded + > DOWNLOAD_FRESHNESS_THRESHOLD + { + tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + } else { + // Location looks ready: proceed + tracing::info!( + "{tenant_shard_id} secondary on {node} is warm enough for migration: {progress:?}" + ); + validated_work.push((tenant_shard_id, optimization)) + } + } } } - reconciles_spawned + validated_work } /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but @@ -4491,10 +4771,12 @@ impl Service { /// put the system into a quiescent state where future background reconciliations won't do anything. pub(crate) async fn reconcile_all_now(&self) -> Result { let reconciles_spawned = self.reconcile_all(); - if reconciles_spawned == 0 { + let reconciles_spawned = if reconciles_spawned == 0 { // Only optimize when we are otherwise idle - self.optimize_all(); - } + self.optimize_all().await + } else { + reconciles_spawned + }; let waiters = { let mut waiters = Vec::new(); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index ffbf2c4b7a..dda17f9887 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -325,18 +325,28 @@ pub(crate) struct ReplaceSecondary { #[derive(Eq, PartialEq, Debug)] pub(crate) struct MigrateAttachment { - old_attached_node_id: NodeId, - new_attached_node_id: NodeId, + pub(crate) old_attached_node_id: NodeId, + pub(crate) new_attached_node_id: NodeId, } #[derive(Eq, PartialEq, Debug)] -pub(crate) enum ScheduleOptimization { +pub(crate) enum ScheduleOptimizationAction { // Replace one of our secondary locations with a different node ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), } +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ScheduleOptimization { + // What was the reconcile sequence when we generated this optimization? The optimization + // should only be applied if the shard's sequence is still at this value, in case other changes + // happened between planning the optimization and applying it. + sequence: Sequence, + + pub(crate) action: ScheduleOptimizationAction, +} + impl ReconcilerWaiter { pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { tokio::select! 
{ @@ -675,10 +685,13 @@ impl TenantShard { "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", self.intent.get_secondary() ); - return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - })); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *preferred_node, + }), + }); } } else { tracing::debug!( @@ -736,28 +749,37 @@ impl TenantShard { "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", self.intent.get_secondary() ); - return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { - old_node_id: *secondary, - new_node_id: candidate_node, - })); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: *secondary, + new_node_id: candidate_node, + }), + }); } } None } + /// Return true if the optimization was really applied: it will not be applied if the optimization's + /// sequence is behind this tenant shard's pub(crate) fn apply_optimization( &mut self, scheduler: &mut Scheduler, optimization: ScheduleOptimization, - ) { + ) -> bool { + if optimization.sequence != self.sequence { + return false; + } + metrics::METRICS_REGISTRY .metrics_group .storage_controller_schedule_optimization .inc(); - match optimization { - ScheduleOptimization::MigrateAttachment(MigrateAttachment { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { old_attached_node_id, new_attached_node_id, }) => { @@ -765,7 +787,7 @@ impl TenantShard { self.intent .promote_attached(scheduler, new_attached_node_id); } - ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id, new_node_id, }) => { @@ -773,6 +795,8 @@ impl TenantShard { self.intent.push_secondary(scheduler, new_node_id); } } + + true } /// Query whether the tenant's observed state for attached node matches its intent state, and if so, @@ -1428,10 +1452,13 @@ pub(crate) mod tests { // would be no other shards from the same tenant, and request to do so. 
assert_eq!( optimization_a, - Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) - })) + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + }) + }) ); // Note that these optimizing two shards in the same tenant with the same ScheduleContext is @@ -1442,10 +1469,13 @@ pub(crate) mod tests { let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); assert_eq!( optimization_b, - Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - })) + Some(ScheduleOptimization { + sequence: shard_b.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(3) + }) + }) ); // Applying these optimizations should result in the end state proposed @@ -1489,10 +1519,13 @@ pub(crate) mod tests { // same tenant should generate an optimization to move one away assert_eq!( optimization_a, - Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { - old_node_id: NodeId(3), - new_node_id: NodeId(4) - })) + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: NodeId(3), + new_node_id: NodeId(4) + }) + }) ); shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bae5945338..258377f8a2 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -287,6 +287,11 @@ def test_sharding_split_smoke( == shard_count ) + # Make secondary downloads slow: this exercises the storage controller logic for not migrating an attachment + # during post-split optimization until the secondary is ready + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] @@ -300,7 +305,7 @@ def test_sharding_split_smoke( # Enough background reconciliations should result in the shards being properly distributed. # Run this before the workload, because its LSN-waiting code presumes stable locations. 
- env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=60) workload.validate() @@ -342,6 +347,10 @@ def test_sharding_split_smoke( assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 assert errored_reconciles is not None and int(errored_reconciles) == 0 + # We should see that the migration of shards after the split waited for secondaries to warm up + # before happening + assert env.storage_controller.log_contains(".*Skipping.*because secondary isn't ready.*") + env.storage_controller.consistency_check() def get_node_shard_counts(env: NeonEnv, tenant_ids): @@ -1071,6 +1080,17 @@ def test_sharding_split_failures( finish_split() assert_split_done() + if isinstance(failure, StorageControllerFailpoint) and "post-complete" in failure.failpoint: + # On a post-complete failure, the controller will recover the post-split state + # after restart, but it will have missed the optimization part of the split function + # where secondary downloads are kicked off. This means that reconcile_until_idle + # will take a very long time if we wait for all optimizations to complete, because + # those optimizations will wait for secondary downloads. + # + # Avoid that by configuring the tenant into Essential scheduling mode, so that it will + # skip optimizations when we're exercising this particular failpoint. + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + # Having completed the split, pump the background reconciles to ensure that # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) From a3fe12b6d898205bddae4f06947841e14c98ff8e Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 3 May 2024 10:43:30 -0400 Subject: [PATCH 148/157] feat(pageserver): add scan interface (#7468) This pull request adds the scan interface. Scan operates on a sparse keyspace and retrieves all the key-value pairs from the keyspaces. Currently, scan only supports the metadata keyspace, and by default do not retrieve anything from the ancestor branch. This should be fixed in the future if we need to have some keyspaces that inherits from the parent. The scan interface reuses the vectored get code path by disabling the missing key errors. This pull request also changes the behavior of vectored get on aux file v1/v2 key/keyspace: if the key is not found, it is simply not included in the result, instead of throwing a missing key error. TODOs in future pull requests: limit memory consumption, ensure the search stops when all keys are covered by the image layer, remove `#[allow(dead_code)]` once the code path is used in basebackups / aux files, remove unnecessary fine-grained keyspace tracking in vectored get (or have another code path for scan) to improve performance. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 7 +- pageserver/src/metrics.rs | 70 ++++++++++++++ pageserver/src/tenant.rs | 124 +++++++++++++++++++++++-- pageserver/src/tenant/timeline.rs | 148 ++++++++++++++++++++---------- 4 files changed, 290 insertions(+), 59 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index ea6115853e..2511de00d5 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -80,7 +80,7 @@ impl Key { } /// Get the range of metadata keys. 
- pub fn metadata_key_range() -> Range { + pub const fn metadata_key_range() -> Range { Key { field1: METADATA_KEY_BEGIN_PREFIX, field2: 0, @@ -572,14 +572,17 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +/// Non inherited range for vectored get. pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); +/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. +pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - !NON_INHERITED_RANGE.contains(&key) + !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) } #[inline(always)] diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d8019b08e2..903bad34cc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -194,6 +194,11 @@ pub(crate) struct GetVectoredLatency { map: EnumMap>, } +#[allow(dead_code)] +pub(crate) struct ScanLatency { + map: EnumMap>, +} + impl GetVectoredLatency { // Only these task types perform vectored gets. Filter all other tasks out to reduce total // cardinality of the metric. @@ -204,6 +209,48 @@ impl GetVectoredLatency { } } +impl ScanLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. + const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) struct ScanLatencyOngoingRecording<'a> { + parent: &'a Histogram, + start: std::time::Instant, +} + +impl<'a> ScanLatencyOngoingRecording<'a> { + pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { + let start = Instant::now(); + ScanLatencyOngoingRecording { parent, start } + } + + pub(crate) fn observe(self, throttled: Option) { + let elapsed = self.start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + if let Some(ex_throttled) = ex_throttled { + self.parent.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } +} + pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", @@ -227,6 +274,29 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| } }); +pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_scan_seconds", + "Time spent in scan, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + ScanLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = 
task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8fa484e7b2..c39c21c6dd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3925,7 +3925,7 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; - use pageserver_api::key::NON_INHERITED_RANGE; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; @@ -4791,15 +4791,7 @@ mod tests { .await; let images = vectored_res?; - let mut key = NON_INHERITED_RANGE.start; - while key < NON_INHERITED_RANGE.end { - assert!(matches!( - images[&key], - Err(PageReconstructError::MissingKey(_)) - )); - key = key.next(); - } - + assert!(images.is_empty()); Ok(()) } @@ -5500,4 +5492,116 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_metadata_scan() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_scan")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 100; // random update + scan base_key + idx * STEP + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. + let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0x10); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for _ in 0..10 { + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = (blknum * STEP) as u32; + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", blknum, last_lsn)) + ); + } + + let mut cnt = 0; + for (key, value) in tline + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + &ctx, + ) + .await? 
+ { + let blknum = key.field6 as usize; + let value = value?; + assert!(blknum % STEP == 0); + let blknum = blknum / STEP; + assert_eq!( + value, + test_img(&format!("{} at {}", blknum, updated[blknum])) + ); + cnt += 1; + } + + assert_eq!(cnt, NUM_KEYS); + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Perform a cycle of flush, compact, and GC + let cutoff = tline.get_last_record_lsn(); + tline + .update_gc_info( + Vec::new(), + cutoff, + Duration::ZERO, + &CancellationToken::new(), + &ctx, + ) + .await?; + tline.freeze_and_flush().await?; + tline + .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .await?; + tline.gc().await?; + } + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 19228bc1f1..c7a5598cec 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,7 +16,10 @@ use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::{AUX_FILES_KEY, NON_INHERITED_RANGE}, + key::{ + AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, + }, keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, @@ -55,7 +58,6 @@ use std::{ ops::ControlFlow, }; -use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -77,6 +79,9 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, +}; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, @@ -885,16 +890,15 @@ impl Timeline { value } } - None => { - error!( - "Expected {}, but singular vectored get returned nothing", - key - ); - Err(PageReconstructError::Other(anyhow!( - "Singular vectored get did not return a value for {}", - key - ))) - } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + traversal_path: Vec::new(), + backtrace: None, + })), } } } @@ -1044,6 +1048,70 @@ impl Timeline { res } + /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored + /// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found + /// during the search, but for the scan interface, it returns all existing key-value pairs, and does + /// not expect each single key in the key space will be found. The semantics is closer to the RocksDB + /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored + /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that + /// the scan operation will not cause OOM in the future. 
+ #[allow(dead_code)] + pub(crate) async fn scan( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + trace!( + "key-value scan request for {:?}@{} from task kind {:?}", + keyspace, + lsn, + ctx.task_kind() + ); + + // We should generalize this into Keyspace::contains in the future. + for range in &keyspace.ranges { + if range.start.field1 < METADATA_KEY_BEGIN_PREFIX + || range.end.field1 >= METADATA_KEY_END_PREFIX + { + return Err(GetVectoredError::Other(anyhow::anyhow!( + "only metadata keyspace can be scanned" + ))); + } + } + + let start = crate::metrics::SCAN_LATENCY + .for_task_kind(ctx.task_kind()) + .map(ScanLatencyOngoingRecording::start_recording); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + // assume scan = 1 quota for now until we find a better way to process this + .throttle(ctx, 1) + .await; + + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + ctx, + ) + .await; + + if let Some(recording) = start { + recording.observe(throttled); + } + + vectored_res + } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn get_vectored_sequential_impl( &self, @@ -1052,6 +1120,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result>, GetVectoredError> { let mut values = BTreeMap::new(); + for range in keyspace.ranges { let mut key = range.start; while key != range.end { @@ -1064,12 +1133,16 @@ impl Timeline { Err(Cancelled | AncestorStopping(_)) => { return Err(GetVectoredError::Cancelled) } - // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380 - Err(MissingKey(err)) if !NON_INHERITED_RANGE.contains(&key) => { - // The vectored read path handles non inherited keys specially. - // If such a a key cannot be reconstructed from the current timeline, - // the vectored read path returns a key level error as opposed to a top - // level error. + Err(MissingKey(_)) + if NON_INHERITED_RANGE.contains(&key) + || NON_INHERITED_SPARSE_RANGE.contains(&key) => + { + // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. + // When we add more types of keys into the page server, we should revisit this part of code and throw errors + // accordingly. + key = key.next(); + } + Err(MissingKey(err)) => { return Err(GetVectoredError::MissingKey(err)); } Err(Other(err)) @@ -1157,6 +1230,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) { + if keyspace.overlaps(&Key::metadata_key_range()) { + // skip validation for metadata key range + return; + } + let sequential_res = self .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) .await; @@ -3209,36 +3287,12 @@ impl Timeline { // Do not descend into the ancestor timeline for aux files. // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. 
- // TODO(chi): this will need to be updated for aux files v2 storage - if keyspace.overlaps(&NON_INHERITED_RANGE) { - let removed = keyspace.remove_overlapping_with(&KeySpace { - ranges: vec![NON_INHERITED_RANGE], - }); - - for range in removed.ranges { - let mut key = range.start; - while key < range.end { - reconstruct_state.on_key_error( - key, - PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn, - request_lsn, - ancestor_lsn: None, - traversal_path: Vec::default(), - backtrace: if cfg!(test) { - Some(std::backtrace::Backtrace::force_capture()) - } else { - None - }, - }), - ); - key = key.next(); - } - } - } + keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + }); + // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look + // into ancestor timelines). TODO: is there any other metadata which we want to inherit? if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { break; } From 9b65946566fc4e4b095cacab56f1843e0679eda0 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 3 May 2024 16:45:24 +0100 Subject: [PATCH 149/157] proxy: add connect compute concurrency lock (#7607) ## Problem Too many connect_compute attempts can overwhelm postgres, getting the connections stuck. ## Summary of changes Limit number of connection attempts that can happen at a given time. --- proxy/src/bin/proxy.rs | 46 +++++++++++++++++++++--------- proxy/src/compute.rs | 11 +++++++ proxy/src/config.rs | 18 ++++++++---- proxy/src/console/provider.rs | 13 ++++----- proxy/src/console/provider/neon.rs | 8 +++--- proxy/src/lib.rs | 3 ++ proxy/src/metrics.rs | 9 ++++++ proxy/src/proxy.rs | 5 +++- proxy/src/proxy/connect_compute.rs | 8 +++++- proxy/src/serverless/backend.rs | 26 +++++++++++++++-- 10 files changed, 112 insertions(+), 35 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 39f6bc8b6d..0956aae6c0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -118,8 +118,11 @@ struct ProxyCliArgs { #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
+ #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -529,24 +532,21 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { endpoint_cache_config, ))); - let config::WakeComputeLockOptions { + let config::ConcurrencyLockOptions { shards, permits, epoch, timeout, } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new( - console::locks::ApiLocks::new( - "wake_compute_lock", - permits, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - ) - .unwrap(), - )); + let locks = Box::leak(Box::new(console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; @@ -572,6 +572,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; + + let config::ConcurrencyLockOptions { + shards, + permits, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)"); + let connect_compute_locks = console::locks::ApiLocks::new( + "connect_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + let http_config = HttpConfig { request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { @@ -607,11 +624,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { region: args.region.clone(), aws_region: args.aws_region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, connect_to_compute_retry_config: config::RetryConfig::parse( &args.connect_to_compute_retry, )?, })); + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + Ok(config) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 44d85c2952..23266ac4ef 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,6 +6,7 @@ use crate::{ error::{ReportableError, UserFacingError}, metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, + Host, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; @@ -101,6 +102,16 @@ impl ConnCfg { } } + pub fn get_host(&self) -> Result { + match self.0.get_hosts() { + [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + // we should not have multiple address or unix addresses. + _ => Err(WakeComputeError::BadComputeAddress( + "invalid compute address".into(), + )), + } + } + /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. 
diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a32ab8c43c..0c8e284d0b 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,7 +1,9 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, + console::locks::ApiLocks, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions, + Host, }; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; @@ -34,6 +36,7 @@ pub struct ProxyConfig { pub handshake_timeout: Duration, pub aws_region: String, pub wake_compute_retry_config: RetryConfig, + pub connect_compute_locks: ApiLocks, pub connect_to_compute_retry_config: RetryConfig, } @@ -573,7 +576,7 @@ impl RetryConfig { } /// Helper for cmdline cache options parsing. -pub struct WakeComputeLockOptions { +pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn @@ -584,9 +587,12 @@ pub struct WakeComputeLockOptions { pub timeout: Duration, } -impl WakeComputeLockOptions { +impl ConcurrencyLockOptions { /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + /// Default options for [`crate::console::provider::ApiLocks`]. + pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = + "shards=64,permits=50,epoch=10m,timeout=500ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; @@ -636,7 +642,7 @@ impl WakeComputeLockOptions { } } -impl FromStr for WakeComputeLockOptions { +impl FromStr for ConcurrencyLockOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { @@ -672,7 +678,7 @@ mod tests { #[test] fn test_parse_lock_options() -> anyhow::Result<()> { - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -683,7 +689,7 @@ mod tests { assert_eq!(shards, 32); assert_eq!(permits, 4); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -694,7 +700,7 @@ mod tests { assert_eq!(shards, 16); assert_eq!(permits, 8); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index aa1800a9da..dfda29e0b1 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -17,7 +17,7 @@ use crate::{ scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; +use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -447,16 +447,16 @@ impl ApiCaches { } /// Various caches for [`console`](super). 
-pub struct ApiLocks { +pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: DashMap>, permits: usize, timeout: Duration, epoch: std::time::Duration, metrics: &'static ApiLockMetrics, } -impl ApiLocks { +impl ApiLocks { pub fn new( name: &'static str, permits: usize, @@ -475,10 +475,7 @@ impl ApiLocks { }) } - pub async fn get_wake_compute_permit( - &self, - key: &EndpointCacheKey, - ) -> Result { + pub async fn get_permit(&self, key: &K) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 58b2a1570c..ec66641d01 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -13,7 +13,7 @@ use crate::{ http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, - scram, Normalize, + scram, EndpointCacheKey, Normalize, }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; @@ -25,7 +25,7 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - pub locks: &'static ApiLocks, + pub locks: &'static ApiLocks, pub endpoint_rate_limiter: Arc, jwt: String, } @@ -35,7 +35,7 @@ impl Api { pub fn new( endpoint: http::Endpoint, caches: &'static ApiCaches, - locks: &'static ApiLocks, + locks: &'static ApiLocks, endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { @@ -289,7 +289,7 @@ impl super::Api for Api { return Err(WakeComputeError::TooManyConnections); } - let permit = self.locks.get_wake_compute_permit(&key).await?; + let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 3f6d985fe8..35c1616481 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -159,6 +159,9 @@ smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); +// postgres hostname, will likely be a port:ip addr +smol_str_wrapper!(Host); + // Endpoints are a bit tricky. Rare they might be branches or projects. impl EndpointId { pub fn is_endpoint(&self) -> bool { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 4a54857012..1590316925 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -126,6 +126,9 @@ pub struct ProxyMetrics { /// Number of events consumed from redis (per event type). 
pub redis_events_count: CounterVec>, + + #[metric(namespace = "connect_compute_lock")] + pub connect_compute_lock: ApiLockMetrics, } #[derive(MetricGroup)] @@ -149,6 +152,12 @@ impl Default for ProxyMetrics { } } +impl Default for ApiLockMetrics { + fn default() -> Self { + Self::new() + } +} + #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "direction")] pub enum HttpDirection { diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 33d73eb675..e4e095d77d 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -301,7 +301,10 @@ pub async fn handle_client( let mut node = connect_to_compute( ctx, - &TcpMechanism { params: ¶ms }, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, &user_info, mode.allow_self_signed_compute(config), config.wake_compute_retry_config, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index da6223209f..c8528d0296 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -2,7 +2,7 @@ use crate::{ auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, config::RetryConfig, - console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, + console::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, @@ -10,6 +10,7 @@ use crate::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, }, + Host, }; use async_trait::async_trait; use pq_proto::StartupMessageParams; @@ -64,6 +65,9 @@ pub trait ComputeConnectBackend { pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, + + /// connect_to_compute concurrency lock + pub locks: &'static ApiLocks, } #[async_trait] @@ -79,6 +83,8 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { + let host = node_info.config.get_host()?; + let _permit = self.locks.get_permit(&host).await?; node_info.connect(ctx, timeout).await } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index c89ebc3251..963913a260 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -9,11 +9,13 @@ use crate::{ config::{AuthenticationConfig, ProxyConfig}, console::{ errors::{GetAuthInfoError, WakeComputeError}, + locks::ApiLocks, CachedNodeInfo, }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, - proxy::connect_compute::ConnectMechanism, + proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + Host, }; use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; @@ -105,6 +107,7 @@ impl PoolingBackend { conn_id, conn_info, pool: self.pool.clone(), + locks: &self.config.connect_compute_locks, }, &backend, false, // do not allow self signed compute for http flow @@ -154,16 +157,31 @@ impl UserFacingError for HttpConnError { } } +impl ShouldRetry for HttpConnError { + fn could_retry(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::ConnectionClosedAbruptly(_) => false, + HttpConnError::GetAuthInfo(_) => false, + HttpConnError::AuthError(_) => false, + HttpConnError::WakeCompute(_) => false, + } + } +} + struct TokioMechanism { pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static 
ApiLocks, } #[async_trait] impl ConnectMechanism for TokioMechanism { type Connection = Client; - type ConnectError = tokio_postgres::Error; + type ConnectError = HttpConnError; type Error = HttpConnError; async fn connect_once( @@ -172,6 +190,9 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + let mut config = (*node_info.config).clone(); let config = config .user(&self.conn_info.user_info.user) @@ -182,6 +203,7 @@ impl ConnectMechanism for TokioMechanism { let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; drop(pause); + drop(permit); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( From ef03b38e5282140a5b7003c7f5010e1707631f31 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 3 May 2024 12:01:33 -0400 Subject: [PATCH 150/157] fix(pageserver): remove update_gc_info calls in tests (#7608) introduced by https://github.com/neondatabase/neon/pull/7468 conflicting with https://github.com/neondatabase/neon/pull/7584 Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c39c21c6dd..fdc49ae295 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5504,6 +5504,8 @@ mod tests { const NUM_KEYS: usize = 1000; const STEP: usize = 100; // random update + scan base_key + idx * STEP + let cancel = CancellationToken::new(); + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); base_key.field1 = AUX_KEY_PREFIX; let mut test_key = base_key; @@ -5585,21 +5587,11 @@ mod tests { } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) From 1e7cd6ac9f3568ffe9db952cb89f8036330d27b5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 3 May 2024 19:15:38 +0200 Subject: [PATCH 151/157] refactor: move `NodeMetadata` to `pageserver_api`; use it from `neon_local` (#7606) This is the first step towards representing all of Pageserver configuration as clean `serde::Serialize`able Rust structs in `pageserver_api`. The `neon_local` code will then use those structs instead of the crude `toml_edit` / string concatenation that it does today. 
refs https://github.com/neondatabase/neon/issues/7555 --------- Co-authored-by: Alex Chi Z --- control_plane/src/bin/neon_local.rs | 8 +++---- control_plane/src/pageserver.rs | 13 ++++++----- libs/pageserver_api/src/config.rs | 31 +++++++++++++++++++++++++ libs/pageserver_api/src/config/tests.rs | 22 ++++++++++++++++++ libs/pageserver_api/src/lib.rs | 6 +---- pageserver/src/config.rs | 24 ++----------------- pageserver/src/control_plane_client.rs | 6 ++--- 7 files changed, 69 insertions(+), 41 deletions(-) create mode 100644 libs/pageserver_api/src/config.rs create mode 100644 libs/pageserver_api/src/config/tests.rs diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index bdd64c8687..14b83c1252 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -14,15 +14,15 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; +use pageserver_api::config::{ + DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, + DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, +}; use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; -use pageserver_api::{ - DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, - DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, -}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::{ diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 52accc5890..1a64391306 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -248,12 +248,13 @@ impl PageServerNode { // situation: the metadata is written by some other script. std::fs::write( metadata_path, - serde_json::to_vec(&serde_json::json!({ - "host": "localhost", - "port": self.pg_connection_config.port(), - "http_host": "localhost", - "http_port": http_port, - })) + serde_json::to_vec(&pageserver_api::config::NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: self.pg_connection_config.port(), + http_host: "localhost".to_string(), + http_port, + other: HashMap::new(), + }) .unwrap(), ) .expect("Failed to write metadata file"); diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs new file mode 100644 index 0000000000..d996a62349 --- /dev/null +++ b/libs/pageserver_api/src/config.rs @@ -0,0 +1,31 @@ +use std::collections::HashMap; + +use const_format::formatcp; + +#[cfg(test)] +mod tests; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + +// Certain metadata (e.g. externally-addressable name, AZ) is delivered +// as a separate structure. This information is not neeed by the pageserver +// itself, it is only used for registering the pageserver with the control +// plane and/or storage controller. 
+// +#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] +pub struct NodeMetadata { + #[serde(rename = "host")] + pub postgres_host: String, + #[serde(rename = "port")] + pub postgres_port: u16, + pub http_host: String, + pub http_port: u16, + + // Deployment tools may write fields to the metadata file beyond what we + // use in this type: this type intentionally only names fields that require. + #[serde(flatten)] + pub other: HashMap, +} diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs new file mode 100644 index 0000000000..edeefc156e --- /dev/null +++ b/libs/pageserver_api/src/config/tests.rs @@ -0,0 +1,22 @@ +use super::*; + +#[test] +fn test_node_metadata_v1_backward_compatibilty() { + let v1 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + })); + + assert_eq!( + serde_json::from_slice::(&v1.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + other: HashMap::new(), + } + ) +} diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 1b948d60c3..532185a366 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,6 +1,5 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use const_format::formatcp; pub mod controller_api; pub mod key; @@ -11,7 +10,4 @@ pub mod shard; /// Public API types pub mod upcall_api; -pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; -pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); -pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; -pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +pub mod config; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 96fff1f0c0..258eed0b12 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; -use std::{collections::HashMap, env}; +use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -51,7 +51,7 @@ pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; - pub use pageserver_api::{ + pub use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; @@ -335,26 +335,6 @@ impl BuilderValue { } } -// Certain metadata (e.g. externally-addressable name, AZ) is delivered -// as a separate structure. This information is not neeed by the pageserver -// itself, it is only used for registering the pageserver with the control -// plane and/or storage controller. -// -#[derive(serde::Deserialize)] -pub(crate) struct NodeMetadata { - #[serde(rename = "host")] - pub(crate) postgres_host: String, - #[serde(rename = "port")] - pub(crate) postgres_port: u16, - pub(crate) http_host: String, - pub(crate) http_port: u16, - - // Deployment tools may write fields to the metadata file beyond what we - // use in this type: this type intentionally only names fields that require. 
- #[serde(flatten)] - pub(crate) other: HashMap, -} - // needed to simplify config construction #[derive(Default)] struct PageServerConfigBuilder { diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index db0032891e..26e7cc7ef8 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -14,10 +14,8 @@ use tokio_util::sync::CancellationToken; use url::Url; use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; -use crate::{ - config::{NodeMetadata, PageServerConf}, - virtual_file::on_fatal_io_error, -}; +use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; +use pageserver_api::config::NodeMetadata; /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) From 64f0613edf47a6975f5d6394e5056cf2eaf7e484 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Fri, 3 May 2024 12:57:45 -0700 Subject: [PATCH 152/157] compute_ctl: Add support for swap resizing (#7434) Part of neondatabase/cloud#12047. Resolves #7239. In short, this PR: 1. Adds `ComputeSpec.swap_size_bytes: Option` 2. Adds a flag to compute_ctl: `--resize-swap-on-bind` 3. Implements running `/neonvm/bin/resize-swap` with the value from the compute spec before starting postgres, if both the value in the spec *AND* the flag are specified. 4. Adds `sudo` to the final image 5. Adds a file in `/etc/sudoers.d` to allow `compute_ctl` to resize swap Various bits of reasoning about design decisions in the added comments. In short: We have both a compute spec field and a flag to make rollout easier to implement. The flag will most likely be removed as part of cleanups for neondatabase/cloud#12047. --- compute_tools/src/bin/compute_ctl.rs | 86 +++++++++++++++++++++------- compute_tools/src/lib.rs | 1 + compute_tools/src/swap.rs | 36 ++++++++++++ control_plane/src/endpoint.rs | 1 + libs/compute_api/src/spec.rs | 17 ++++++ vm-image-spec.yaml | 22 +++++++ 6 files changed, 143 insertions(+), 20 deletions(-) create mode 100644 compute_tools/src/swap.rs diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 117919786e..471d46d4f2 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -47,7 +47,7 @@ use chrono::Utc; use clap::Arg; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info}; +use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; @@ -62,6 +62,7 @@ use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; +use compute_tools::swap::resize_swap; // this is an arbitrary build tag. 
Fine as a default / for testing purposes // in-case of not-set environment var @@ -110,6 +111,7 @@ fn main() -> Result<()> { .expect("Postgres connection string is required"); let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current @@ -275,33 +277,72 @@ fn main() -> Result<()> { "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); + // before we release the mutex, fetch the swap size (if any) for later. + let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; drop(state); // Launch remaining service threads let _monitor_handle = launch_monitor(&compute); let _configurator_handle = launch_configurator(&compute); - // Start Postgres + let mut prestartup_failed = false; let mut delay_exit = false; - let mut exit_code = None; - let pg = match compute.start_compute(extension_server_port) { - Ok(pg) => Some(pg), - Err(err) => { - error!("could not start the compute node: {:#}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout. - compute.state_changed.notify_all(); - drop(state); // unlock - delay_exit = true; - None + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + match resize_swap(size_bytes) { + Ok(()) => { + let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_gib, "resized swap"); + } + Err(err) => { + let err = err.context("failed to resize swap"); + error!("{err:#}"); + + // Mark compute startup as failed; don't try to start postgres, and report this + // error to the control plane when it next asks. + prestartup_failed = true; + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{err:?}")); + state.status = ComputeStatus::Failed; + compute.state_changed.notify_all(); + delay_exit = true; + } } - }; + } + + // Start Postgres + let mut pg = None; + let mut exit_code = None; + + if !prestartup_failed { + pg = match compute.start_compute(extension_server_port) { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:#}", err); + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{:?}", err)); + state.status = ComputeStatus::Failed; + // Notify others that Postgres failed to start. In case of configuring the + // empty compute, it's likely that API handler is still waiting for compute + // state change. 
With this we will notify it that compute is in Failed state, + // so control plane will know about it earlier and record proper error instead + // of timeout. + compute.state_changed.notify_all(); + drop(state); // unlock + delay_exit = true; + None + } + }; + } else { + warn!("skipping postgres startup because pre-startup step failed"); + } // Start the vm-monitor if directed to. The vm-monitor only runs on linux // because it requires cgroups. @@ -526,6 +567,11 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("resize-swap-on-bind") + .long("resize-swap-on-bind") + .action(clap::ArgAction::SetTrue), + ) } /// When compute_ctl is killed, send also termination signal to sync-safekeepers diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 4e01ffd954..eac808385c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -14,4 +14,5 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +pub mod swap; pub mod sync_sk; diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs new file mode 100644 index 0000000000..c22b6bc14e --- /dev/null +++ b/compute_tools/src/swap.rs @@ -0,0 +1,36 @@ +use anyhow::{anyhow, Context}; +use tracing::warn; + +pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; + +pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { + // run `/neonvm/bin/resize-swap --once {size_bytes}` + // + // Passing '--once' causes resize-swap to delete itself after successful completion, which + // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while + // postgres is running. + // + // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg. + let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(RESIZE_SWAP_BIN) + .arg("--once") + .arg(size_bytes.to_string()) + .spawn(); + + if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + return Ok(()); + } + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => Err(anyhow!("process exited with {status}")), + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| { + format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`") + }) +} diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 03f7db99fb..20371e1cb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -554,6 +554,7 @@ impl Endpoint { format_version: 1.0, operation_uuid: None, features: self.features.clone(), + swap_size_bytes: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 71ae66c45c..1c4ee2089f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -33,6 +33,23 @@ pub struct ComputeSpec { #[serde(default)] pub features: Vec, + /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs + /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first + /// received. 
+ /// + /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's + /// spec generation doesn't need to be aware of the actual compute it's running on, while + /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could + /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus + /// giving every VM much more swap than it should have (32GiB). + /// + /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for + /// enabling the swap resizing behavior once rollout is complete. + /// + /// See neondatabase/cloud#12047 for more. + #[serde(default)] + pub swap_size_bytes: Option, + /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 3ccdf5cc64..41ca16f16b 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -5,6 +5,12 @@ commands: user: root sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' - name: pgbouncer user: postgres sysvInitAction: respawn @@ -24,6 +30,11 @@ commands: shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: + - filename: compute_ctl-resize-swap + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap - filename: pgbouncer.ini content: | [databases] @@ -353,6 +364,17 @@ merge: | && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. + RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml From 4deb8dc52ec26ab59a4d0b7ff548ef389e6717f9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 4 May 2024 08:44:18 +0300 Subject: [PATCH 153/157] compute_ctl: Be more precise in how startup time is calculated (#7601) - On a non-pooled start, do not reset the 'start_time' after launching the HTTP service. In a non-pooled start, it's fair to include that in the total startup time. - When setting wait_for_spec_ms and resetting start_time, call Utc::now() only once. It's a waste of cycles to call it twice, but also, it ensures the time between setting wait_for_spec_ms and resetting start_time is included in one or the other time period. These differences should be insignificant in practice, in the microsecond range, but IMHO it seems more logical and readable this way too. Also fix and clarify some of the surrounding comments. 
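For reference, the single-timestamp pattern described above reduces to roughly the following condensed sketch. The function and field names are simplified stand-ins for the ComputeState fields the patch touches, not the actual compute_ctl code:

    use chrono::{DateTime, Utc};

    /// Record how long we waited for the spec and reset the startup baseline
    /// from a single clock reading, so no interval between the two assignments
    /// is dropped or double-counted.
    fn record_spec_wait(start_time: &mut DateTime<Utc>) -> u64 {
        let now = Utc::now();
        let wait_for_spec_ms = now
            .signed_duration_since(*start_time)
            .to_std()
            .unwrap_or_default() // sketch only: treat a backwards clock step as zero
            .as_millis() as u64;
        // Later "total startup time" is measured from start_time, so resetting
        // it here excludes the time spent waiting for the spec.
        *start_time = now;
        wait_for_spec_ms
    }

    fn main() {
        let mut start_time = Utc::now();
        // ... wait for the spec to arrive ...
        let waited = record_spec_wait(&mut start_time);
        println!("wait_for_spec_ms = {waited}");
    }

Taking one clock reading keeps the waited interval and the new baseline contiguous, which is exactly the property the points above argue for.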
(This caught my eye while reviewing PR #7577) --- compute_tools/src/bin/compute_ctl.rs | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 471d46d4f2..67c5250376 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -228,14 +228,14 @@ fn main() -> Result<()> { // If this is a pooled VM, prewarm before starting HTTP server and becoming // available for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have it's memory allocated from the host, and + // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if !spec_set { compute.prewarm_postgres()?; } - // Launch http service first, so we were able to serve control-plane - // requests, while configuration is still in progress. + // Launch http service first, so that we can serve control-plane requests + // while configuration is still in progress. let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); @@ -255,21 +255,22 @@ fn main() -> Result<()> { break; } } + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; } // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - - // Record for how long we slept waiting for the spec. - state.metrics.wait_for_spec_ms = Utc::now() - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - // Reset start time to the actual start of the configuration, so that - // total startup time was properly measured at the end. - state.start_time = Utc::now(); - state.status = ComputeStatus::Init; compute.state_changed.notify_all(); From 5da3e2113ad309e50b784a96d08b437e201cde06 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 May 2024 17:59:41 +0300 Subject: [PATCH 154/157] Allow bad state (not active) pageserver error/warns in walcraft test. The top reason for it being flaky. --- test_runner/regress/test_crafted_wal_end.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 01ecc2b95f..30f8d81890 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,6 +19,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") + env.pageserver.allowed_errors.extend( + [ + # seems like pageserver stop triggers these + ".*initial size calculation failed.*Bad state (not active).*", + ] + ) endpoint = env.endpoints.create("test_crafted_wal_end") wal_craft = WalCraft(env) From ce4d3da3ae2d83f8a4dc632112c95580a2a25fbe Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 1 May 2024 18:22:34 +0300 Subject: [PATCH 155/157] Properly initialize first WAL segment on safekeepers. 
Previously its segment header and page header of first record weren't initialized because compute streams data only since first record LSN. Also, fix a bug in the existing code for initialization: xlp_rem_len must not include page header. These changes make first segment pg_waldump'able. --- libs/postgres_ffi/src/xlog_utils.rs | 39 ++++++++++++------- safekeeper/src/safekeeper.rs | 16 ++++++++ safekeeper/src/wal_storage.rs | 24 ++++++++++++ .../tests/walproposer_sim/safekeeper_disk.rs | 4 ++ 4 files changed, 70 insertions(+), 13 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 4a66a0ab1d..0bbb91afc2 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -331,7 +331,10 @@ impl CheckPoint { /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround. - let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID); + let mut new_xid = std::cmp::max( + xid.wrapping_add(1), + pg_constants::FIRST_NORMAL_TRANSACTION_ID, + ); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE new_xid = @@ -367,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD); + // xlp_rem_len doesn't include page header, hence the subtraction. + ( + seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) } else { (0, 0) }; @@ -397,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + ( + (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) + } else { + (0, 0) + }; let header = XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - pg_constants::XLP_FIRST_IS_CONTRECORD - } else { - 0 - }, + xlp_info, xlp_tli: PG_TLI, xlp_pageaddr: lsn.page_lsn().0, - xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - page_off as u32 - } else { - 0u32 - }, + xlp_rem_len, ..Default::default() // Put 0 in padding fields. }; let hdr_bytes = header.encode()?; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f2ee0403eb..e671d4f36a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -725,6 +725,18 @@ where self.state.inmem.commit_lsn ); + // Before first WAL write initialize its segment. It makes first segment + // pg_waldump'able because stream from compute doesn't include its + // segment and page headers. + // + // If we fail before first WAL write flush this action would be + // repeated, that's ok because it is idempotent. 
+ if self.wal_store.flush_lsn() == Lsn::INVALID { + self.wal_store + .initialize_first_segment(msg.start_streaming_at) + .await?; + } + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg @@ -1007,6 +1019,10 @@ mod tests { self.lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 147f318b9f..6bc8c7c3f9 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -38,6 +38,12 @@ pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; + /// Initialize segment by creating proper long header at the beginning of + /// the segment and short header at the page of given LSN. This is only used + /// for timeline initialization because compute will stream data only since + /// init_lsn. Other segment headers are included in compute stream. + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -78,6 +84,8 @@ pub struct PhysicalStorage { /// Size of WAL segment in bytes. wal_seg_size: usize, + pg_version: u32, + system_id: u64, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. @@ -169,6 +177,8 @@ impl PhysicalStorage { timeline_dir, conf: conf.clone(), wal_seg_size, + pg_version: state.server.pg_version, + system_id: state.server.system_id, write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, @@ -324,6 +334,20 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let segno = init_lsn.segment_number(self.wal_seg_size); + let (mut file, _) = self.open_or_create(segno).await?; + let major_pg_version = self.pg_version / 10000; + let wal_seg = + postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&wal_seg).await?; + file.flush().await?; + info!("initialized segno {} at lsn {}", segno, init_lsn); + // note: file is *not* fsynced + Ok(()) + } + /// Write WAL to disk. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index 35bca325aa..c2db9de78a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -182,6 +182,10 @@ impl wal_storage::Storage for DiskWALStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { if self.write_lsn != startpos { From 0353a72a00887173f802ba044d169a4c278ea8f8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 May 2024 17:43:31 +0300 Subject: [PATCH 156/157] pg_waldump segment on safekeeper in test_pg_waldump. To test it as well. 
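The segment only becomes pg_waldump-readable because the initialization above writes consistent long and short page headers into the padded region. As a rough illustration of the corrected arithmetic for the segment's first page, using stand-in constants rather than the real postgres_ffi definitions:

    // Stand-in values: 40 bytes is the usual size of XLogLongPageHeaderData,
    // and 0x0001 is XLP_FIRST_IS_CONTRECORD in PostgreSQL headers.
    const XLOG_SIZE_OF_XLOG_LONG_PHD: u64 = 40;
    const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;

    /// If the init LSN is not at the very start of the segment, the space
    /// between the long page header and the init LSN is presented as the tail
    /// of a continued record; xlp_rem_len deliberately excludes the header
    /// itself.
    fn first_page_header_fields(seg_off: u64) -> (u32, u16) {
        if seg_off > 0 {
            assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
            (
                (seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD) as u32,
                XLP_FIRST_IS_CONTRECORD,
            )
        } else {
            (0, 0)
        }
    }

    fn main() {
        // An init LSN 0x2000 bytes into the segment: everything after the
        // 40-byte long header counts as "remaining length" of a continuation.
        assert_eq!(first_page_header_fields(0x2000), (0x2000 - 40, 0x0001));
        // Init LSN exactly at the segment boundary: nothing to mark.
        assert_eq!(first_page_header_fields(0), (0, 0));
    }

The page containing the init LSN gets the analogous treatment with the short page header size (XLOG_SIZE_OF_XLOG_SHORT_PHD), as shown in the xlog_utils.rs hunk above.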
--- test_runner/regress/test_pg_waldump.py | 33 +++++++++++++++++++------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py index 1973aa5952..8e80efd9ba 100644 --- a/test_runner/regress/test_pg_waldump.py +++ b/test_runner/regress/test_pg_waldump.py @@ -1,13 +1,28 @@ import os +import shutil from fixtures.neon_fixtures import NeonEnv, PgBin from fixtures.utils import subprocess_capture +def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture( + test_output_dir, [pg_waldump_path, "--ignore", segment_path] + ) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout + + # Simple test to check that pg_waldump works with neon WAL files def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): env = neon_simple_env - env.neon_cli.create_branch("test_pg_waldump", "empty") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") endpoint = env.endpoints.create_start("test_pg_waldump") cur = endpoint.connect().cursor() @@ -35,12 +50,12 @@ def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): assert endpoint.pgdata_dir wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + # check segment on compute + check_wal_segment(pg_waldump_path, wal_path, test_output_dir) - # use special --ignore option to ignore the validation checks in pg_waldump - # this is necessary, because neon WAL files contain gap at the beginning - output_path, _, _ = subprocess_capture(test_output_dir, [pg_waldump_path, "--ignore", wal_path]) - - with open(f"{output_path}.stdout", "r") as f: - stdout = f.read() - assert "ABORT" in stdout - assert "COMMIT" in stdout + # Check file on safekeepers as well. pg_waldump is strict about file naming, so remove .partial suffix. + sk = env.safekeepers[0] + sk_tli_dir = sk.timeline_dir(tenant_id, timeline_id) + non_partial_path = os.path.join(sk_tli_dir, "000000010000000000000001") + shutil.copyfile(os.path.join(sk_tli_dir, "000000010000000000000001.partial"), non_partial_path) + check_wal_segment(pg_waldump_path, non_partial_path, test_output_dir) From e6da7e29ed0a550ec59686bea7e656e16b2f13e7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 2 May 2024 11:51:24 +0300 Subject: [PATCH 157/157] Add option allowing running multiple endpoints on the same branch. This is used by safekeeper tests. 
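The new flag below is a standard clap boolean switch that simply bypasses the conflicting-endpoint check. A minimal, self-contained sketch of that gating (not the actual neon_local wiring, which also threads the flag through the Python fixtures shown further down):

    use clap::{Arg, ArgAction, Command};

    fn main() {
        let cmd = Command::new("endpoint-example").arg(
            Arg::new("allow-multiple")
                .long("allow-multiple")
                .action(ArgAction::SetTrue),
        );
        // Simulate `endpoint-example --allow-multiple` on the command line.
        let matches = cmd.get_matches_from(["endpoint-example", "--allow-multiple"]);

        let allow_multiple = matches.get_flag("allow-multiple");
        if !allow_multiple {
            // In neon_local this is where check_conflicting_endpoints() runs;
            // passing the flag skips it so tests can start several endpoints
            // on one branch.
            println!("would reject a second endpoint on the same branch");
        }
    }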
--- control_plane/src/bin/neon_local.rs | 28 +++++++++++++++---- test_runner/fixtures/neon_fixtures.py | 22 +++++++++++++-- .../regress/test_wal_acceptor_async.py | 20 ++++++++++--- 3 files changed, 58 insertions(+), 12 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 14b83c1252..e01d5c9799 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -837,6 +837,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .copied() .unwrap_or(false); + let allow_multiple = sub_args.get_flag("allow-multiple"); + let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, @@ -854,7 +856,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + } cplane.new_endpoint( &endpoint_id, @@ -883,6 +887,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let remote_ext_config = sub_args.get_one::("remote-ext-config"); + let allow_multiple = sub_args.get_flag("allow-multiple"); + // If --safekeepers argument is given, use only the listed safekeeper nodes. let safekeepers = if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { @@ -908,11 +914,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .cloned() .unwrap_or_default(); - cplane.check_conflicting_endpoints( - endpoint.mode, - endpoint.tenant_id, - endpoint.timeline_id, - )?; + if !allow_multiple { + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; + } let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let conf = env.get_pageserver_conf(pageserver_id).unwrap(); @@ -1444,6 +1452,12 @@ fn cli() -> Command { .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`") .required(false); + let allow_multiple = Arg::new("allow-multiple") + .help("Allow multiple primary endpoints running on the same branch. 
Shouldn't be used normally, but useful for tests.") + .long("allow-multiple") + .action(ArgAction::SetTrue) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1601,6 +1615,7 @@ fn cli() -> Command { .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) .arg(update_catalog) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1609,6 +1624,7 @@ fn cli() -> Command { .arg(safekeepers_arg) .arg(remote_ext_config_args) .arg(create_test_user) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 90884ad7f8..240b6ee199 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1801,6 +1801,7 @@ class NeonCli(AbstractNeonCli): hot_standby: bool = False, lsn: Optional[Lsn] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1824,6 +1825,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--hot-standby", "true"]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1835,6 +1838,7 @@ class NeonCli(AbstractNeonCli): safekeepers: Optional[List[int]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1849,6 +1853,8 @@ class NeonCli(AbstractNeonCli): args.append(endpoint_id) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -3299,6 +3305,7 @@ class Endpoint(PgProtocol): lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -3321,6 +3328,7 @@ class Endpoint(PgProtocol): pg_port=self.pg_port, http_port=self.http_port, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -3337,7 +3345,10 @@ class Endpoint(PgProtocol): return self def start( - self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + self, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Start the Postgres instance. @@ -3353,6 +3364,7 @@ class Endpoint(PgProtocol): safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) self.running = True @@ -3482,6 +3494,7 @@ class Endpoint(PgProtocol): config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. 
@@ -3497,7 +3510,12 @@ class Endpoint(PgProtocol): hot_standby=hot_standby, lsn=lsn, pageserver_id=pageserver_id, - ).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id) + allow_multiple=allow_multiple, + ).start( + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) log.info(f"Postgres startup took {time.time() - started_at} seconds") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 5902eb3217..dce5616ac6 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -254,7 +254,9 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): ) -def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): +def endpoint_create_start( + env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False +): endpoint = Endpoint( env, tenant_id=env.initial_tenant, @@ -268,14 +270,23 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): # embed current time in endpoint ID endpoint_id = pgdir_name or f"ep-{time.time()}" return endpoint.create_start( - branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"] + branch_name=branch, + endpoint_id=endpoint_id, + config_lines=["log_statement=all"], + allow_multiple=allow_multiple, ) async def exec_compute_query( - env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None + env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None, + allow_multiple: bool = False, ): - with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint: + with endpoint_create_start( + env, branch=branch, pgdir_name=pgdir_name, allow_multiple=allow_multiple + ) as endpoint: before_conn = time.time() conn = await endpoint.connect_async() res = await conn.fetch(query) @@ -347,6 +358,7 @@ class BackgroundCompute(object): self.branch, f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", pgdir_name=f"bgcompute{self.index}_key{verify_key}", + allow_multiple=True, ) log.info(f"result: {res}") if len(res) != 1: